[Docs]: Fix Multi-User Port Allocation Conflicts (#3601)
Co-authored-by: zhaochenyang20 <zhaochen20@outlook.com> Co-authored-by: simveit <simp.veitner@gmail.com>
This commit is contained in:
@@ -36,42 +36,70 @@ find . -name '*.ipynb' -exec nbstripout {} \;
|
|||||||
# After these checks pass, push your changes and open a PR on your branch
|
# After these checks pass, push your changes and open a PR on your branch
|
||||||
pre-commit run --all-files
|
pre-commit run --all-files
|
||||||
```
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### **Port Allocation and CI Efficiency**
|
||||||
|
|
||||||
If you need to run and shut down a SGLang server or engine, following these examples:
|
**To launch and kill the server:**
|
||||||
|
|
||||||
1. Launch and close Sever:
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
#Launch Sever
|
from sglang.test.test_utils import is_in_ci
|
||||||
|
from sglang.utils import wait_for_server, print_highlight, terminate_process
|
||||||
|
|
||||||
from sglang.utils import (
|
if is_in_ci():
|
||||||
execute_shell_command,
|
from patch import launch_server_cmd
|
||||||
wait_for_server,
|
else:
|
||||||
terminate_process,
|
from sglang.utils import launch_server_cmd
|
||||||
print_highlight,
|
|
||||||
|
server_process, port = launch_server_cmd(
|
||||||
|
"""
|
||||||
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||||
|
--host 0.0.0.0
|
||||||
|
"""
|
||||||
)
|
)
|
||||||
|
|
||||||
server_process = execute_shell_command(
|
wait_for_server(f"http://localhost:{port}")
|
||||||
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0"
|
|
||||||
)
|
|
||||||
|
|
||||||
wait_for_server("http://localhost:30000")
|
|
||||||
|
|
||||||
# Terminate Sever
|
|
||||||
|
|
||||||
|
# Terminate Server
|
||||||
terminate_process(server_process)
|
terminate_process(server_process)
|
||||||
```
|
```
|
||||||
2. Launch Engine and close Engine
|
|
||||||
|
**To launch and kill the engine:**
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Launch Engine
|
# Launch Engine
|
||||||
|
|
||||||
import sglang as sgl
|
import sglang as sgl
|
||||||
import asyncio
|
import asyncio
|
||||||
|
from sglang.test.test_utils import is_in_ci
|
||||||
|
|
||||||
|
if is_in_ci():
|
||||||
|
import patch
|
||||||
|
|
||||||
llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
|
llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
|
||||||
|
|
||||||
# Terminate Engine
|
# Terminate Engine
|
||||||
llm.shutdown()
|
llm.shutdown()
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### **Why this approach?**
|
||||||
|
|
||||||
|
- **Dynamic Port Allocation**: Avoids port conflicts by selecting an available port at runtime, enabling multiple server instances to run in parallel.
|
||||||
|
- **Optimized for CI**: The `patch` version of `launch_server_cmd` and `sgl.Engine()` in CI environments helps manage GPU memory dynamically, preventing conflicts and improving test parallelism.
|
||||||
|
- **Better Parallel Execution**: Ensures smooth concurrent tests by avoiding fixed port collisions and optimizing memory usage.
|
||||||
|
|
||||||
|
### **Model Selection**
|
||||||
|
|
||||||
|
For demonstrations in the docs, **prefer smaller models** to reduce memory consumption and speed up inference. Running larger models in CI can lead to instability due to memory constraints.
|
||||||
|
|
||||||
|
### **Prompt Alignment Example**
|
||||||
|
|
||||||
|
When designing prompts, ensure they align with SGLang’s structured formatting. For example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
prompt = """You are an AI assistant. Answer concisely and accurately.
|
||||||
|
|
||||||
|
User: What is the capital of France?
|
||||||
|
Assistant: The capital of France is Paris."""
|
||||||
|
```
|
||||||
|
|
||||||
|
This keeps responses aligned with expected behavior and improves reliability across different files.
|
||||||
|
|||||||
@@ -405,7 +405,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"terminate_process(server_process, port)"
|
"terminate_process(server_process)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -252,7 +252,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"terminate_process(server_process, port)\n",
|
"terminate_process(server_process)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"embedding_process, port = launch_server_cmd(\n",
|
"embedding_process, port = launch_server_cmd(\n",
|
||||||
" \"\"\"\n",
|
" \"\"\"\n",
|
||||||
@@ -286,7 +286,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"terminate_process(embedding_process, port)"
|
"terminate_process(embedding_process)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -304,7 +304,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"terminate_process(embedding_process, port)\n",
|
"terminate_process(embedding_process)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Note that SGLang now treats embedding models and reward models as the same type of models.\n",
|
"# Note that SGLang now treats embedding models and reward models as the same type of models.\n",
|
||||||
"# This will be updated in the future.\n",
|
"# This will be updated in the future.\n",
|
||||||
@@ -355,7 +355,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"terminate_process(reward_process, port)"
|
"terminate_process(reward_process)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -425,7 +425,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"terminate_process(tokenizer_free_server_process, port)"
|
"terminate_process(tokenizer_free_server_process)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -512,7 +512,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"terminate_process(server_process, port)"
|
"terminate_process(server_process)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -169,7 +169,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"terminate_process(embedding_process, port)"
|
"terminate_process(embedding_process)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -243,7 +243,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"terminate_process(embedding_process, port)"
|
"terminate_process(embedding_process)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
|
import weakref
|
||||||
|
|
||||||
from sglang.utils import execute_shell_command, reserve_port
|
from sglang.utils import execute_shell_command, reserve_port
|
||||||
|
|
||||||
@@ -21,15 +22,29 @@ def patched_post_init(self):
|
|||||||
|
|
||||||
server_args_mod.ServerArgs.__post_init__ = patched_post_init
|
server_args_mod.ServerArgs.__post_init__ = patched_post_init
|
||||||
|
|
||||||
|
process_socket_map = weakref.WeakKeyDictionary()
|
||||||
|
|
||||||
|
|
||||||
def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
|
def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
|
||||||
|
"""
|
||||||
|
Launch the server using the given command.
|
||||||
|
If no port is specified, a free port is reserved.
|
||||||
|
"""
|
||||||
if port is None:
|
if port is None:
|
||||||
port = reserve_port()
|
port, lock_socket = reserve_port(host)
|
||||||
|
else:
|
||||||
|
lock_socket = None
|
||||||
|
|
||||||
extra_flags = (
|
extra_flags = (
|
||||||
f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} "
|
f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} "
|
||||||
f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} "
|
f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} "
|
||||||
f"--disable-cuda-graph"
|
f"--disable-cuda-graph"
|
||||||
)
|
)
|
||||||
|
|
||||||
full_command = f"{command} --port {port} {extra_flags}"
|
full_command = f"{command} --port {port} {extra_flags}"
|
||||||
process = execute_shell_command(full_command)
|
process = execute_shell_command(full_command)
|
||||||
|
|
||||||
|
if lock_socket is not None:
|
||||||
|
process_socket_map[process] = lock_socket
|
||||||
|
|
||||||
return process, port
|
return process, port
|
||||||
|
|||||||
@@ -243,15 +243,8 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"terminate_process(server_process, port)"
|
"terminate_process(server_process)"
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|||||||
@@ -397,7 +397,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"terminate_process(server_process, port)"
|
"terminate_process(server_process)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
SGLang Documentation
|
SGLang Documentation
|
||||||
====================================
|
====================
|
||||||
|
|
||||||
SGLang is a fast serving framework for large language models and vision language models.
|
SGLang is a fast serving framework for large language models and vision language models.
|
||||||
It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
|
It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
|
||||||
@@ -10,7 +10,6 @@ The core features include:
|
|||||||
- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
|
- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
|
||||||
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
|
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
|
||||||
|
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
:caption: Getting Started
|
:caption: Getting Started
|
||||||
@@ -39,7 +38,6 @@ The core features include:
|
|||||||
frontend/frontend.md
|
frontend/frontend.md
|
||||||
frontend/choices_methods.md
|
frontend/choices_methods.md
|
||||||
|
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
:caption: SGLang Router
|
:caption: SGLang Router
|
||||||
@@ -47,24 +45,47 @@ The core features include:
|
|||||||
router/router.md
|
router/router.md
|
||||||
|
|
||||||
|
|
||||||
|
References
|
||||||
|
==========
|
||||||
|
|
||||||
|
General
|
||||||
|
---------------------
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
:caption: References
|
|
||||||
|
|
||||||
references/supported_models.md
|
references/supported_models.md
|
||||||
|
references/contribution_guide.md
|
||||||
|
references/troubleshooting.md
|
||||||
|
references/faq.md
|
||||||
|
references/learn_more.md
|
||||||
|
|
||||||
|
Hardware
|
||||||
|
--------------------------
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
references/AMD.md
|
||||||
|
references/amd_configure.md
|
||||||
|
references/nvidia_jetson.md
|
||||||
|
|
||||||
|
Advanced Models & Deployment
|
||||||
|
------------------------------
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
references/deepseek.md
|
||||||
|
references/multi_node.md
|
||||||
|
references/multi_node_inference_k8s_lws.md
|
||||||
|
references/modelscope.md
|
||||||
|
|
||||||
|
Performance & Tuning
|
||||||
|
--------------------
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
references/sampling_params.md
|
references/sampling_params.md
|
||||||
references/hyperparameter_tuning.md
|
references/hyperparameter_tuning.md
|
||||||
references/benchmark_and_profiling.md
|
references/benchmark_and_profiling.md
|
||||||
references/accuracy_evaluation.md
|
references/accuracy_evaluation.md
|
||||||
references/custom_chat_template.md
|
references/custom_chat_template.md
|
||||||
references/amd_configure.md
|
|
||||||
references/deepseek.md
|
|
||||||
references/multi_node.md
|
|
||||||
references/multi_node_inference_k8s_lws.md
|
|
||||||
references/modelscope.md
|
|
||||||
references/quantization.md
|
references/quantization.md
|
||||||
references/contribution_guide.md
|
|
||||||
references/troubleshooting.md
|
|
||||||
references/nvidia_jetson.md
|
|
||||||
references/faq.md
|
|
||||||
references/learn_more.md
|
|
||||||
|
|||||||
@@ -2,19 +2,27 @@
|
|||||||
|
|
||||||
You can install SGLang using any of the methods below. For running DeepSeek V3/R1 with SGLang, refer to [DeepSeek V3 Support](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3). It is always recommended to use the [latest release version](https://pypi.org/project/sglang/#history) and deploy it with [Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended) to avoid already-fixed issues and environment-related problems.
|
You can install SGLang using any of the methods below. For running DeepSeek V3/R1 with SGLang, refer to [DeepSeek V3 Support](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3). It is always recommended to use the [latest release version](https://pypi.org/project/sglang/#history) and deploy it with [Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended) to avoid already-fixed issues and environment-related problems.
|
||||||
|
|
||||||
## Method 1: With pip
|
## Method 1: With pip or uv
|
||||||
```
|
|
||||||
|
We recommend using uv to install the dependencies, as it installs packages faster than pip:
|
||||||
|
|
||||||
|
```bash
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install sgl-kernel --force-reinstall --no-deps
|
pip install uv
|
||||||
pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
|
uv pip install sgl-kernel --force-reinstall --no-deps
|
||||||
|
uv pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
|
||||||
```
|
```
|
||||||
|
|
||||||
Note: SGLang currently uses torch 2.5, so you need to install the flashinfer version for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). Please note that the package currently used by FlashInfer is named `flashinfer-python`, not `flashinfer`.
|
**Quick Fix to Installation**
|
||||||
|
|
||||||
If you experience an error like `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`, please try either of the following solutions:
|
- SGLang currently uses torch 2.5, so you need to install the flashinfer version for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). Please note that the package currently used by FlashInfer is named `flashinfer-python`, not `flashinfer`.
|
||||||
|
|
||||||
- Use `export CUDA_HOME=/usr/local/cuda-<your-cuda-version>` to set the `CUDA_HOME` environment variable.
|
- If you experience an error like `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`, please try either of the following solutions:
|
||||||
- Follow the procedure described in [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) first, then install SGLang as described above.
|
|
||||||
|
1. Use `export CUDA_HOME=/usr/local/cuda-<your-cuda-version>` to set the `CUDA_HOME` environment variable.
|
||||||
|
2. Follow the procedure described in [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) first, then install SGLang as described above.
|
||||||
|
|
||||||
|
- If you encounter `ImportError: cannot import name 'is_valid_list_of_images' from 'transformers.models.llama.image_processing_llama'`, use the version of `transformers` specified in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/python/pyproject.toml). Currently, that means running `pip install transformers==4.48.3`.
|
||||||
|
|
||||||
## Method 2: From source
|
## Method 2: From source
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -5,12 +5,15 @@ import importlib
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
import signal
|
import signal
|
||||||
|
import socket
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
import weakref
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from json import dumps
|
from json import dumps
|
||||||
@@ -21,6 +24,8 @@ import requests
|
|||||||
from IPython.display import HTML, display
|
from IPython.display import HTML, display
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from sglang.srt.utils import kill_process_tree
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@@ -306,27 +311,12 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
|
|||||||
return filename
|
return filename
|
||||||
|
|
||||||
|
|
||||||
import fcntl
|
|
||||||
|
|
||||||
|
|
||||||
def is_in_ci():
|
def is_in_ci():
|
||||||
from sglang.test.test_utils import is_in_ci
|
from sglang.test.test_utils import is_in_ci
|
||||||
|
|
||||||
return is_in_ci()
|
return is_in_ci()
|
||||||
|
|
||||||
|
|
||||||
LOCKFILE = os.path.expanduser("~/.sglang_port_lock")
|
|
||||||
PORT_REGISTRY = os.path.expanduser("~/.sglang_port_registry.json")
|
|
||||||
|
|
||||||
if not os.path.exists(LOCKFILE):
|
|
||||||
with open(LOCKFILE, "w") as f:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if not os.path.exists(PORT_REGISTRY):
|
|
||||||
with open(PORT_REGISTRY, "w") as f:
|
|
||||||
json.dump([], f)
|
|
||||||
|
|
||||||
|
|
||||||
def print_highlight(html_content: str):
|
def print_highlight(html_content: str):
|
||||||
if is_in_ci():
|
if is_in_ci():
|
||||||
html_content = str(html_content).replace("\n", "<br>")
|
html_content = str(html_content).replace("\n", "<br>")
|
||||||
@@ -335,55 +325,44 @@ def print_highlight(html_content: str):
|
|||||||
print(html_content)
|
print(html_content)
|
||||||
|
|
||||||
|
|
||||||
def init_port_registry():
|
process_socket_map = weakref.WeakKeyDictionary()
|
||||||
"""Initialize the port registry file if it doesn't exist."""
|
|
||||||
if not os.path.exists(PORT_REGISTRY):
|
|
||||||
with open(PORT_REGISTRY, "w") as f:
|
|
||||||
json.dump([], f)
|
|
||||||
|
|
||||||
|
|
||||||
def reserve_port(start=30000, end=40000):
|
def reserve_port(host, start=30000, end=40000):
|
||||||
"""
|
"""
|
||||||
Reserve an available port using a file lock and a registry.
|
Reserve an available port by trying to bind a socket.
|
||||||
Returns the allocated port.
|
Returns a tuple (port, lock_socket) where `lock_socket` is kept open to hold the lock.
|
||||||
"""
|
"""
|
||||||
init_port_registry()
|
candidates = list(range(start, end))
|
||||||
with open(LOCKFILE, "w") as lock:
|
random.shuffle(candidates)
|
||||||
fcntl.flock(lock, fcntl.LOCK_EX)
|
|
||||||
|
for port in candidates:
|
||||||
|
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||||
|
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
||||||
try:
|
try:
|
||||||
with open(PORT_REGISTRY, "r") as f:
|
# Attempt to bind to the port on localhost
|
||||||
used = json.load(f)
|
sock.bind((host, port))
|
||||||
except Exception:
|
return port, sock
|
||||||
used = []
|
except socket.error:
|
||||||
for port in range(start, end):
|
sock.close() # Failed to bind, try next port
|
||||||
if port not in used:
|
continue
|
||||||
used.append(port)
|
raise RuntimeError("No free port available.")
|
||||||
with open(PORT_REGISTRY, "w") as f:
|
|
||||||
json.dump(used, f)
|
|
||||||
return port
|
|
||||||
raise RuntimeError("No free port available")
|
|
||||||
|
|
||||||
|
|
||||||
def release_port(port):
|
def release_port(lock_socket):
|
||||||
"""Release the reserved port by removing it from the registry."""
|
"""
|
||||||
with open(LOCKFILE, "w") as lock:
|
Release the reserved port by closing the lock socket.
|
||||||
fcntl.flock(lock, fcntl.LOCK_EX)
|
"""
|
||||||
try:
|
try:
|
||||||
with open(PORT_REGISTRY, "r") as f:
|
lock_socket.close()
|
||||||
used = json.load(f)
|
except Exception as e:
|
||||||
except Exception:
|
print(f"Error closing socket: {e}")
|
||||||
used = []
|
|
||||||
if port in used:
|
|
||||||
used.remove(port)
|
|
||||||
with open(PORT_REGISTRY, "w") as f:
|
|
||||||
json.dump(used, f)
|
|
||||||
|
|
||||||
|
|
||||||
def execute_shell_command(command: str) -> subprocess.Popen:
|
def execute_shell_command(command: str) -> subprocess.Popen:
|
||||||
"""
|
"""
|
||||||
Execute a shell command and return its process handle.
|
Execute a shell command and return its process handle.
|
||||||
"""
|
"""
|
||||||
# Replace newline continuations and split the command string.
|
|
||||||
command = command.replace("\\\n", " ").replace("\\", " ")
|
command = command.replace("\\\n", " ").replace("\\", " ")
|
||||||
parts = command.split()
|
parts = command.split()
|
||||||
return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)
|
return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)
|
||||||
@@ -395,21 +374,28 @@ def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
|
|||||||
If no port is specified, a free port is reserved.
|
If no port is specified, a free port is reserved.
|
||||||
"""
|
"""
|
||||||
if port is None:
|
if port is None:
|
||||||
port = reserve_port()
|
port, lock_socket = reserve_port(host)
|
||||||
|
else:
|
||||||
|
lock_socket = None
|
||||||
|
|
||||||
full_command = f"{command} --port {port}"
|
full_command = f"{command} --port {port}"
|
||||||
process = execute_shell_command(full_command)
|
process = execute_shell_command(full_command)
|
||||||
|
|
||||||
|
if lock_socket is not None:
|
||||||
|
process_socket_map[process] = lock_socket
|
||||||
|
|
||||||
return process, port
|
return process, port
|
||||||
|
|
||||||
|
|
||||||
def terminate_process(process, port=None):
|
def terminate_process(process):
|
||||||
"""
|
"""
|
||||||
Terminate the process and, if a port was reserved, release it.
|
Terminate the process and automatically release the reserved port.
|
||||||
"""
|
"""
|
||||||
from sglang.srt.utils import kill_process_tree
|
|
||||||
|
|
||||||
kill_process_tree(process.pid)
|
kill_process_tree(process.pid)
|
||||||
if port is not None:
|
|
||||||
release_port(port)
|
lock_socket = process_socket_map.pop(process, None)
|
||||||
|
if lock_socket is not None:
|
||||||
|
release_port(lock_socket)
|
||||||
|
|
||||||
|
|
||||||
def wait_for_server(base_url: str, timeout: int = None) -> None:
|
def wait_for_server(base_url: str, timeout: int = None) -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user