sglangv0.5.2 & support Qwen3-Next-80B-A3B-Instruct
This commit is contained in:
58
docs/Makefile
Normal file
58
docs/Makefile
Normal file
@@ -0,0 +1,58 @@
|
||||
# Minimal Makefile for Sphinx documentation
|
||||
SPHINXOPTS ?=
|
||||
SPHINXBUILD ?= sphinx-build
|
||||
SPHINXAUTOBUILD ?= sphinx-autobuild
|
||||
SOURCEDIR = .
|
||||
BUILDDIR = _build
|
||||
PORT ?= 8003
|
||||
|
||||
help:
|
||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
@echo ""
|
||||
@echo "Additional targets:"
|
||||
@echo " serve to build and serve documentation with auto-build and live reload"
|
||||
|
||||
# Compile Notebook files and record execution time
|
||||
compile:
|
||||
@set -e; \
|
||||
echo "Starting Notebook compilation..."; \
|
||||
mkdir -p logs; \
|
||||
echo "Notebook execution timings:" > logs/timing.log; \
|
||||
START_TOTAL=$$(date +%s); \
|
||||
find $(SOURCEDIR) -path "*/_build/*" -prune -o -name "*.ipynb" -print0 | \
|
||||
parallel -0 -j3 --halt soon,fail=1 ' \
|
||||
NB_NAME=$$(basename {}); \
|
||||
START_TIME=$$(date +%s); \
|
||||
retry --delay=0 --times=2 -- \
|
||||
jupyter nbconvert --to notebook --execute --inplace "{}" \
|
||||
--ExecutePreprocessor.timeout=600 \
|
||||
--ExecutePreprocessor.kernel_name=python3; \
|
||||
RET_CODE=$$?; \
|
||||
END_TIME=$$(date +%s); \
|
||||
ELAPSED_TIME=$$((END_TIME - START_TIME)); \
|
||||
echo "$${NB_NAME}: $${ELAPSED_TIME}s" >> logs/timing.log; \
|
||||
exit $$RET_CODE' || exit 1; \
|
||||
END_TOTAL=$$(date +%s); \
|
||||
TOTAL_ELAPSED=$$((END_TOTAL - START_TOTAL)); \
|
||||
echo "---------------------------------" >> logs/timing.log; \
|
||||
echo "Total execution time: $${TOTAL_ELAPSED}s" >> logs/timing.log; \
|
||||
echo "All Notebook execution timings:" && cat logs/timing.log
|
||||
|
||||
# Serve documentation with auto-build and live reload
|
||||
serve:
|
||||
@echo "Starting auto-build server at http://0.0.0.0:$(PORT)"
|
||||
@$(SPHINXAUTOBUILD) "$(SOURCEDIR)" "$(BUILDDIR)/html" \
|
||||
--host 0.0.0.0 \
|
||||
--port $(PORT) \
|
||||
--watch $(SOURCEDIR) \
|
||||
--re-ignore ".*\.(ipynb_checkpoints|pyc|pyo|pyd|git)"
|
||||
|
||||
.PHONY: help Makefile compile clean serve
|
||||
|
||||
%: Makefile
|
||||
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
|
||||
clean:
|
||||
find . -name "*.ipynb" -exec nbstripout {} \;
|
||||
rm -rf $(BUILDDIR)
|
||||
rm -rf logs
|
||||
55
docs/README.md
Normal file
55
docs/README.md
Normal file
@@ -0,0 +1,55 @@
|
||||
# SGLang Documentation
|
||||
|
||||
We recommend new contributors start from writing documentation, which helps you quickly understand SGLang codebase.
|
||||
Most documentation files are located under the `docs/` folder.
|
||||
|
||||
## Docs Workflow
|
||||
|
||||
### Install Dependency
|
||||
|
||||
```bash
|
||||
apt-get update && apt-get install -y pandoc parallel retry
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### Update Documentation
|
||||
|
||||
Update your Jupyter notebooks in the appropriate subdirectories under `docs/`. If you add new files, remember to update `index.rst` (or relevant `.rst` files) accordingly.
|
||||
|
||||
- **`pre-commit run --all-files`** manually runs all configured checks, applying fixes if possible. If it fails the first time, re-run it to ensure lint errors are fully resolved. Make sure your code passes all checks **before** creating a Pull Request.
|
||||
|
||||
```bash
|
||||
# 1) Compile all Jupyter notebooks
|
||||
make compile # This step can take a long time (10+ mins). You can consider skipping this step if you can make sure your added files are correct.
|
||||
make html
|
||||
|
||||
# 2) Compile and Preview documentation locally with auto-build
|
||||
# This will automatically rebuild docs when files change
|
||||
# Open your browser at the displayed port to view the docs
|
||||
bash serve.sh
|
||||
|
||||
# 2a) Alternative ways to serve documentation
|
||||
# Directly use make serve
|
||||
make serve
|
||||
# With custom port
|
||||
PORT=8080 make serve
|
||||
|
||||
# 3) Clean notebook outputs
|
||||
# nbstripout removes notebook outputs so your PR stays clean
|
||||
pip install nbstripout
|
||||
find . -name '*.ipynb' -exec nbstripout {} \;
|
||||
|
||||
# 4) Pre-commit checks and create a PR
|
||||
# After these checks pass, push your changes and open a PR on your branch
|
||||
pre-commit run --all-files
|
||||
```
|
||||
---
|
||||
|
||||
## Documentation Style Guidelines
|
||||
|
||||
- For common functionalities, we prefer **Jupyter Notebooks** over Markdown so that all examples can be executed and validated by our docs CI pipeline. For complex features (e.g., distributed serving), Markdown is preferred.
|
||||
- Keep in mind the documentation execution time when writing interactive Jupyter notebooks. Each interactive notebook will be run and compiled against every commit to ensure they are runnable, so it is important to apply some tips to reduce the documentation compilation time:
|
||||
- Use small models (e.g., `qwen/qwen2.5-0.5b-instruct`) for most cases to reduce server launch time.
|
||||
- Reuse the launched server as much as possible to reduce server launch time.
|
||||
- Do not use absolute links (e.g., `https://docs.sglang.ai/get_started/install.html`). Always prefer relative links (e.g., `../get_started/install.md`).
|
||||
- Follow the existing examples to learn how to launch a server, send a query and other common styles.
|
||||
29
docs/_static/css/custom_log.css
vendored
Normal file
29
docs/_static/css/custom_log.css
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
.output_area {
|
||||
color: #615656;
|
||||
}
|
||||
|
||||
table.autosummary td {
|
||||
width: 50%
|
||||
}
|
||||
|
||||
img.align-center {
|
||||
display: block;
|
||||
margin-left: auto;
|
||||
margin-right: auto;
|
||||
}
|
||||
|
||||
.output_area.stderr {
|
||||
color: #d3d3d3 !important;
|
||||
}
|
||||
|
||||
.output_area.stdout {
|
||||
color: #d3d3d3 !important;
|
||||
}
|
||||
|
||||
div.output_area.stderr {
|
||||
color: #d3d3d3 !important;
|
||||
}
|
||||
|
||||
div.output_area.stdout {
|
||||
color: #d3d3d3 !important;
|
||||
}
|
||||
9
docs/_static/css/readthedocs.css
vendored
Normal file
9
docs/_static/css/readthedocs.css
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
table.autosummary td {
|
||||
width: 50%
|
||||
}
|
||||
|
||||
img.align-center {
|
||||
display: block;
|
||||
margin-left: auto;
|
||||
margin-right: auto;
|
||||
}
|
||||
BIN
docs/_static/image/logo.ico
vendored
Normal file
BIN
docs/_static/image/logo.ico
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 37 KiB |
BIN
docs/_static/image/logo.png
vendored
Normal file
BIN
docs/_static/image/logo.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 393 KiB |
104
docs/advanced_features/attention_backend.md
Normal file
104
docs/advanced_features/attention_backend.md
Normal file
@@ -0,0 +1,104 @@
|
||||
# Attention Backend
|
||||
|
||||
SGLang supports multiple attention backends. Each of them has different pros and cons.
|
||||
You can test them according to your needs.
|
||||
|
||||
## Supporting matrix for different attention backends
|
||||
|
||||
| **Backend** | **Page Size > 1** | **Spec Decoding** | **MLA** | **Sliding Window** | **MultiModal** |
|
||||
|--------------------------|-------------------|-------------------|---------|--------------------|----------------|
|
||||
| **FlashInfer** | ❌ | ✅ | ✅ | ✅ | ✅ |
|
||||
| **FA3** | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| **Triton** | ❌ | ✅ | ✅ | ✅ | ❌ |
|
||||
| **Torch Native** | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| **FlashMLA** | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| **TRTLLM MLA** | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| **Ascend** | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| **Wave** | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
|
||||
**Notes:**
|
||||
- TRTLLM MLA only implements decode operations. For prefill operations (including multimodal inputs), it falls back to FlashInfer MLA backend.
|
||||
|
||||
Note: Every kernel backend is compatible with a page size > 1 by specifying an argument such as `--page-size 16`.
|
||||
This is because a page size of 16 can be converted to a page size of 1 in the kernel backend.
|
||||
The "❌" and "✅" symbols in the table above under "Page Size > 1" indicate whether the kernel actually operates with a page size greater than 1, rather than treating a page size of 16 as a page size of 1.
|
||||
|
||||
## User guide
|
||||
|
||||
### Launch command for different attention backends.
|
||||
|
||||
- FlashInfer (Default for Non-Hopper Machines, e.g., A100, A40)
|
||||
```bash
|
||||
python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend flashinfer
|
||||
python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-V3 --attention-backend flashinfer --trust-remote-code
|
||||
```
|
||||
|
||||
- FlashAttention 3 (Default for Hopper Machines, e.g., H100, H200, H20)
|
||||
```bash
|
||||
python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend fa3
|
||||
python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-V3 --trust-remote-code --attention-backend fa3
|
||||
```
|
||||
|
||||
- Triton
|
||||
```bash
|
||||
python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend triton
|
||||
python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-V3 --attention-backend triton --trust-remote-code
|
||||
```
|
||||
|
||||
- Torch Native
|
||||
```bash
|
||||
python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend torch_native
|
||||
```
|
||||
|
||||
- FlashMLA
|
||||
```bash
|
||||
python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-R1 --attention-backend flashmla --trust-remote-code
|
||||
python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-R1 --attention-backend flashmla --kv-cache-dtype fp8_e4m3 --trust-remote-code
|
||||
```
|
||||
|
||||
- TRTLLM MLA (Optimized for Blackwell Architecture, e.g., B200)
|
||||
```bash
|
||||
python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-R1 --attention-backend trtllm_mla --trust-remote-code
|
||||
```
|
||||
|
||||
- TRTLLM MLA with FP8 KV Cache (Higher concurrency, lower memory footprint)
|
||||
```bash
|
||||
python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-R1 --attention-backend trtllm_mla --kv-cache-dtype fp8_e4m3 --trust-remote-code
|
||||
```
|
||||
|
||||
- Ascend
|
||||
```bash
|
||||
python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend ascend
|
||||
```
|
||||
|
||||
- Wave
|
||||
```bash
|
||||
python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend wave
|
||||
```
|
||||
|
||||
## Steps to add a new attention backend
|
||||
To add a new attention backend, you can learn from the existing backends
|
||||
(`python/sglang/srt/layers/attention/triton_backend.py`, `python/sglang/srt/layers/attention/flashattention_backend.py`)
|
||||
and follow the steps below.
|
||||
|
||||
1. Run without cuda graph. Support the two forward functions
|
||||
- forward_extend
|
||||
- Will be used for prefill, prefill with KV cache, and target verification
|
||||
- It will be called once per layer
|
||||
- forward_decode
|
||||
- Will be used for normal decode, and draft decode
|
||||
- It will be called once per layer
|
||||
- init_forward_metadata
|
||||
- Initialize the class and common metadata shared by all layers
|
||||
- Call the plan function for optimizations like split_kv
|
||||
- It will be called once per forward
|
||||
2. Run with cuda graph. It has two phases (capture and replay) and you need to implement three functions
|
||||
- init_cuda_graph_state
|
||||
      - It will be called once during the backend's lifetime
|
||||
- Create all common shared buffers
|
||||
- init_forward_metadata_capture_cuda_graph
|
||||
- It will be called before capturing a cuda graph
|
||||
      - It is similar to init_forward_metadata but writes the metadata to some pre-defined buffers
|
||||
- init_forward_metadata_replay_cuda_graph
|
||||
- It will be called before replaying a cuda graph
|
||||
- This function is in the critical path and needs to be fast
|
||||
77
docs/advanced_features/hyperparameter_tuning.md
Normal file
77
docs/advanced_features/hyperparameter_tuning.md
Normal file
@@ -0,0 +1,77 @@
|
||||
# Hyperparameter Tuning
|
||||
|
||||
## Achieving high throughput for offline batch inference
|
||||
|
||||
Achieving a large batch size is the most important thing for attaining high throughput in offline batch inference.
|
||||
When the server is running at full load in a steady state, look for the following in the log:
|
||||
|
||||
```Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, cuda graph: True, gen throughput (token/s): 4594.01, #queue-req: 317```
|
||||
|
||||
### Adjust the request submission speed to control `#queue-req`
|
||||
|
||||
`#queue-req` indicates the number of requests in the queue.
|
||||
If you frequently see `#queue-req: 0`, it suggests that your client code is submitting requests too slowly.
|
||||
A healthy range for `#queue-req` is `100 - 2000`.
|
||||
However, avoid making `#queue-req` too large, as this will increase the scheduling overhead on the server.
|
||||
|
||||
### Achieve a high `token usage`
|
||||
|
||||
`token usage` indicates the KV cache memory utilization of the server. `token usage > 0.9` means good utilization.
|
||||
|
||||
If you frequently see `token usage < 0.9` and `#queue-req > 0`, it means the server is too conservative about taking in new requests. You can decrease `--schedule-conservativeness` to a value like 0.3.
|
||||
The case of a server being too conservative can happen when users send many requests with a large `max_new_tokens` but the requests stop very early due to EOS or stop strings.
|
||||
|
||||
On the other hand, if you see `token usage` very high and you frequently see warnings like
|
||||
`KV cache pool is full. Retract requests. #retracted_reqs: 1, #new_token_ratio: 0.9998 -> 1.0000`, you can increase `--schedule-conservativeness` to a value like 1.3.
|
||||
If you see `KV cache pool is full. Retract requests.` occasionally but not frequently, it is okay.
|
||||
|
||||
### Tune `--mem-fraction-static` to increase KV cache pool capacity
|
||||
SGLang allocates memory as follows:
|
||||
|
||||
Total memory usage = model weights + KV cache pool + CUDA graph buffers + activations
|
||||
|
||||
The `--mem-fraction-static` parameter determines how much memory is allocated to the first two components:
|
||||
|
||||
mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity
|
||||
|
||||
To support higher concurrency, you should maximize the KV cache pool capacity by setting `--mem-fraction-static` as high as possible while still reserving enough memory for activations and CUDA graph buffers.
|
||||
|
||||
SGLang uses simple heuristics to set the default value of `--mem-fraction-static`, but you can optimize it for your use cases.
|
||||
As a rule of thumb, reserving 5–8 GB of memory for activations is typically sufficient. You can check this by inspecting the logs just before the server is ready.
|
||||
Look for log entries like this:
|
||||
|
||||
```
|
||||
[2025-08-11 17:17:03] max_total_num_tokens=665690, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=4096, context_len=65536, available_gpu_mem=13.50 GB
|
||||
```
|
||||
|
||||
Check the `available_gpu_mem` value.
|
||||
- If it is between 5–8 GB, the setting is good.
|
||||
- If it is too high (e.g., 10 - 20 GB), increase `--mem-fraction-static` to allocate more memory to the KV cache.
|
||||
- If it is too low, you risk out-of-memory (OOM) errors later, so decrease `--mem-fraction-static`.
|
||||
|
||||
Another straightforward approach is to increase `--mem-fraction-static` in increments of 0.01 until you encounter OOM errors for your workloads.
|
||||
|
||||
### Avoid out-of-memory errors by tuning `--chunked-prefill-size`, `--mem-fraction-static`, and `--max-running-requests`
|
||||
|
||||
If you encounter out-of-memory (OOM) errors, you can adjust the following parameters:
|
||||
|
||||
- If OOM occurs during prefill, try reducing `--chunked-prefill-size` to `4096` or `2048`. This saves memory but slows down the prefill speed for long prompts.
|
||||
- If OOM occurs during decoding, try lowering `--max-running-requests`.
|
||||
- You can also reduce `--mem-fraction-static` to a smaller value, such as 0.8 or 0.7. This decreases the memory usage of the KV cache memory pool and helps prevent OOM errors during both prefill and decoding. However, it limits maximum concurrency and reduces peak throughput.
|
||||
|
||||
### Tune `--cuda-graph-max-bs`
|
||||
By default, CUDA graph is enabled only for small batch sizes (e.g., less than 160 or 256).
|
||||
However, for some models, especially at large tensor parallelism sizes, CUDA graph can be useful for batch sizes up to 512 or 768.
|
||||
Therefore, it may be beneficial to increase `--cuda-graph-max-bs` to a larger value.
|
||||
Note that CUDA graph consumes more memory, so you may need to reduce `--mem-fraction-static` at the same time.
|
||||
|
||||
### Tune `--dp-size` and `--tp-size`
|
||||
|
||||
Data parallelism is better for throughput. When there is enough GPU memory, always favor data parallelism for throughput. Refer to [sglang router](../advanced_features/router.md) for better data parallelism instead of using the `dp_size` parameter.
|
||||
|
||||
### Try other options
|
||||
|
||||
- `torch.compile` accelerates small models on small batch sizes. You can enable it with `--enable-torch-compile`.
|
||||
- Try other quantization (e.g. FP8 quantization with `--quantization fp8`)
|
||||
- Try other parallelism strategies (e.g. [expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/)) or DP attention for deepseek models (with `--enable-dp-attention --dp-size 8`).
|
||||
- If the workload has many shared prefixes, try `--schedule-policy lpm`. Here, `lpm` stands for longest prefix match. It reorders requests to encourage more cache hits but introduces more scheduling overhead.
|
||||
529
docs/advanced_features/lora.ipynb
Normal file
529
docs/advanced_features/lora.ipynb
Normal file
@@ -0,0 +1,529 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# LoRA Serving"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"SGLang enables the use of [LoRA adapters](https://arxiv.org/abs/2106.09685) with a base model. By incorporating techniques from [S-LoRA](https://arxiv.org/pdf/2311.03285) and [Punica](https://arxiv.org/pdf/2310.18547), SGLang can efficiently support multiple LoRA adapters for different sequences within a single batch of inputs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Arguments for LoRA Serving"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The following server arguments are relevant for multi-LoRA serving:\n",
|
||||
"\n",
|
||||
"* `enable_lora`: Enable LoRA support for the model. This argument is automatically set to True if `--lora-paths` is provided for backward compatibility.\n",
|
||||
"\n",
|
||||
"* `lora_paths`: The list of LoRA adapters to load. Each adapter must be specified in one of the following formats: <PATH> | <NAME>=<PATH> | JSON with schema {\"lora_name\":str,\"lora_path\":str,\"pinned\":bool}.\n",
|
||||
"\n",
|
||||
"* `max_loras_per_batch`: Maximum number of adapters used by each batch. This argument can affect the amount of GPU memory reserved for multi-LoRA serving, so it should be set to a smaller value when memory is scarce. Defaults to 8.\n",
|
||||
"\n",
|
||||
"* `max_loaded_loras`: If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `max-loras-per-batch`.\n",
|
||||
"\n",
|
||||
"* `lora_backend`: The backend for running GEMM kernels for LoRA modules. Currently we only support the Triton LoRA backend. In the future, faster backends built upon Cutlass or CUDA kernels will be added.\n",
|
||||
"\n",
|
||||
"* `max_lora_rank`: The maximum LoRA rank that should be supported. If not specified, it will be automatically inferred from the adapters provided in `--lora-paths`. This argument is needed when you expect to dynamically load adapters of larger LoRA rank after server startup.\n",
|
||||
"\n",
|
||||
"* `lora_target_modules`: The union set of all target modules where LoRA should be applied (e.g., `q_proj`, `k_proj`, `gate_proj`). If not specified, it will be automatically inferred from the adapters provided in `--lora-paths`. This argument is needed when you expect to dynamically load adapters of different target modules after server startup. You can also set it to `all` to enable LoRA for all supported modules. However, enabling LoRA on additional modules introduces a minor performance overhead. If your application is performance-sensitive, we recommend only specifying the modules for which you plan to load adapters.\n",
|
||||
"\n",
|
||||
"* `tp_size`: LoRA serving along with Tensor Parallelism is supported by SGLang. `tp_size` controls the number of GPUs for tensor parallelism. More details on the tensor sharding strategy can be found in [S-Lora](https://arxiv.org/pdf/2311.03285) paper.\n",
|
||||
"\n",
|
||||
"From the client side, the user needs to provide a list of strings as the input batch, and a list of adapter names that each input sequence corresponds to."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Usage\n",
|
||||
"\n",
|
||||
"### Serving a Single Adapter"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"from sglang.utils import wait_for_server, terminate_process"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
"python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
||||
" --enable-lora \\\n",
|
||||
" --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n",
|
||||
" --max-loras-per-batch 1 --lora-backend triton \\\n",
|
||||
" --log-level warning \\\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"url = f\"http://127.0.0.1:{port}\"\n",
|
||||
"json_data = {\n",
|
||||
" \"text\": [\n",
|
||||
" \"List 3 countries and their capitals.\",\n",
|
||||
" \"List 3 countries and their capitals.\",\n",
|
||||
" ],\n",
|
||||
" \"sampling_params\": {\"max_new_tokens\": 32, \"temperature\": 0},\n",
|
||||
" # The first input uses lora0, and the second input uses the base model\n",
|
||||
" \"lora_path\": [\"lora0\", None],\n",
|
||||
"}\n",
|
||||
"response = requests.post(\n",
|
||||
" url + \"/generate\",\n",
|
||||
" json=json_data,\n",
|
||||
")\n",
|
||||
"print(f\"Output 0: {response.json()[0]['text']}\")\n",
|
||||
"print(f\"Output 1: {response.json()[1]['text']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Serving Multiple Adapters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
"python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
||||
" --enable-lora \\\n",
|
||||
" --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n",
|
||||
" lora1=Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16 \\\n",
|
||||
" --max-loras-per-batch 2 --lora-backend triton \\\n",
|
||||
" --log-level warning \\\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"url = f\"http://127.0.0.1:{port}\"\n",
|
||||
"json_data = {\n",
|
||||
" \"text\": [\n",
|
||||
" \"List 3 countries and their capitals.\",\n",
|
||||
" \"List 3 countries and their capitals.\",\n",
|
||||
" ],\n",
|
||||
" \"sampling_params\": {\"max_new_tokens\": 32, \"temperature\": 0},\n",
|
||||
" # The first input uses lora0, and the second input uses lora1\n",
|
||||
" \"lora_path\": [\"lora0\", \"lora1\"],\n",
|
||||
"}\n",
|
||||
"response = requests.post(\n",
|
||||
" url + \"/generate\",\n",
|
||||
" json=json_data,\n",
|
||||
")\n",
|
||||
"print(f\"Output 0: {response.json()[0]['text']}\")\n",
|
||||
"print(f\"Output 1: {response.json()[1]['text']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Dynamic LoRA loading"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Instead of specifying all adapters during server startup via `--lora-paths`, you can also load & unload LoRA adapters dynamically via the `/load_lora_adapter` and `/unload_lora_adapter` API.\n",
|
||||
"\n",
|
||||
"When using dynamic LoRA loading, it's recommended to explicitly specify both `--max-lora-rank` and `--lora-target-modules` at startup. For backward compatibility, SGLang will infer these values from `--lora-paths` if they are not explicitly provided. However, in that case, you would have to ensure that all dynamically loaded adapters share the same shape (rank and target modules) as those in the initial `--lora-paths` or are strictly \"smaller\"."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"lora0 = \"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16\" # rank - 4, target modules - q_proj, k_proj, v_proj, o_proj, gate_proj\n",
|
||||
"lora1 = \"algoprog/fact-generation-llama-3.1-8b-instruct-lora\" # rank - 64, target modules - q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj\n",
|
||||
"lora0_new = \"philschmid/code-llama-3-1-8b-text-to-sql-lora\" # rank - 256, target modules - q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# The `--lora-target-modules` param below is technically not needed, as the server will infer it from lora0 which already has all the target modules specified.\n",
|
||||
"# We are adding it here just to demonstrate usage.\n",
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
" python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
||||
" --enable-lora \\\n",
|
||||
" --cuda-graph-max-bs 2 \\\n",
|
||||
" --max-loras-per-batch 2 --lora-backend triton \\\n",
|
||||
" --max-lora-rank 256\n",
|
||||
" --lora-target-modules all\n",
|
||||
" --log-level warning\n",
|
||||
" \"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"url = f\"http://127.0.0.1:{port}\"\n",
|
||||
"wait_for_server(url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Load adapter lora0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = requests.post(\n",
|
||||
" url + \"/load_lora_adapter\",\n",
|
||||
" json={\n",
|
||||
" \"lora_name\": \"lora0\",\n",
|
||||
" \"lora_path\": lora0,\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if response.status_code == 200:\n",
|
||||
" print(\"LoRA adapter loaded successfully.\", response.json())\n",
|
||||
"else:\n",
|
||||
" print(\"Failed to load LoRA adapter.\", response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Load adapter lora1:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = requests.post(\n",
|
||||
" url + \"/load_lora_adapter\",\n",
|
||||
" json={\n",
|
||||
" \"lora_name\": \"lora1\",\n",
|
||||
" \"lora_path\": lora1,\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if response.status_code == 200:\n",
|
||||
" print(\"LoRA adapter loaded successfully.\", response.json())\n",
|
||||
"else:\n",
|
||||
" print(\"Failed to load LoRA adapter.\", response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Check inference output:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"url = f\"http://127.0.0.1:{port}\"\n",
|
||||
"json_data = {\n",
|
||||
" \"text\": [\n",
|
||||
" \"List 3 countries and their capitals.\",\n",
|
||||
" \"List 3 countries and their capitals.\",\n",
|
||||
" ],\n",
|
||||
" \"sampling_params\": {\"max_new_tokens\": 32, \"temperature\": 0},\n",
|
||||
" # The first input uses lora0, and the second input uses lora1\n",
|
||||
" \"lora_path\": [\"lora0\", \"lora1\"],\n",
|
||||
"}\n",
|
||||
"response = requests.post(\n",
|
||||
" url + \"/generate\",\n",
|
||||
" json=json_data,\n",
|
||||
")\n",
|
||||
"print(f\"Output from lora0: \\n{response.json()[0]['text']}\\n\")\n",
|
||||
"print(f\"Output from lora1 (updated): \\n{response.json()[1]['text']}\\n\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Unload lora0 and replace it with a different adapter:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = requests.post(\n",
|
||||
" url + \"/unload_lora_adapter\",\n",
|
||||
" json={\n",
|
||||
" \"lora_name\": \"lora0\",\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"response = requests.post(\n",
|
||||
" url + \"/load_lora_adapter\",\n",
|
||||
" json={\n",
|
||||
" \"lora_name\": \"lora0\",\n",
|
||||
" \"lora_path\": lora0_new,\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if response.status_code == 200:\n",
|
||||
" print(\"LoRA adapter loaded successfully.\", response.json())\n",
|
||||
"else:\n",
|
||||
" print(\"Failed to load LoRA adapter.\", response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Check output again:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"url = f\"http://127.0.0.1:{port}\"\n",
|
||||
"json_data = {\n",
|
||||
" \"text\": [\n",
|
||||
" \"List 3 countries and their capitals.\",\n",
|
||||
" \"List 3 countries and their capitals.\",\n",
|
||||
" ],\n",
|
||||
" \"sampling_params\": {\"max_new_tokens\": 32, \"temperature\": 0},\n",
|
||||
" # The first input uses lora0, and the second input uses lora1\n",
|
||||
" \"lora_path\": [\"lora0\", \"lora1\"],\n",
|
||||
"}\n",
|
||||
"response = requests.post(\n",
|
||||
" url + \"/generate\",\n",
|
||||
" json=json_data,\n",
|
||||
")\n",
|
||||
"print(f\"Output from lora0: \\n{response.json()[0]['text']}\\n\")\n",
|
||||
"print(f\"Output from lora1 (updated): \\n{response.json()[1]['text']}\\n\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### LoRA GPU Pinning"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Another advanced option is to specify adapters as `pinned` during loading. When an adapter is pinned, it is permanently assigned to one of the available GPU pool slots (as configured by `--max-loras-per-batch`) and will not be evicted from GPU memory during runtime. Instead, it remains resident until it is explicitly unloaded.\n",
|
||||
"\n",
|
||||
"This can improve performance in scenarios where the same adapter is frequently used across requests, by avoiding repeated memory transfers and reinitialization overhead. However, since GPU pool slots are limited, pinning adapters reduces the flexibility of the system to dynamically load other adapters on demand. If too many adapters are pinned, it may lead to degraded performance, or in the most extreme case (`Number of pinned adapters == max-loras-per-batch`), halt all unpinned requests. Therefore, currently SGLang limits maximal number of pinned adapters to `max-loras-per-batch - 1` to prevent unexpected starvations. \n",
|
||||
"\n",
|
||||
"In the example below, we start a server with `lora0` loaded as pinned, and `lora1` and `lora2` loaded as regular (unpinned) adapters. Please note that we intentionally specify `lora1` and `lora2` in two different formats to demonstrate that both are supported."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
" python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
||||
" --enable-lora \\\n",
|
||||
" --cuda-graph-max-bs 8 \\\n",
|
||||
" --max-loras-per-batch 3 --lora-backend triton \\\n",
|
||||
" --max-lora-rank 256 \\\n",
|
||||
" --lora-target-modules all \\\n",
|
||||
" --lora-paths \\\n",
|
||||
" {\"lora_name\":\"lora0\",\"lora_path\":\"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16\",\"pinned\":true} \\\n",
|
||||
" {\"lora_name\":\"lora1\",\"lora_path\":\"algoprog/fact-generation-llama-3.1-8b-instruct-lora\"} \\\n",
|
||||
" lora2=philschmid/code-llama-3-1-8b-text-to-sql-lora \\\n",
|
||||
" --log-level warning\n",
|
||||
" \"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"url = f\"http://127.0.0.1:{port}\"\n",
|
||||
"wait_for_server(url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can also specify an adapter as pinned during dynamic adapter loading. In the example below, we reload `lora1` as a pinned adapter:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = requests.post(\n",
|
||||
" url + \"/unload_lora_adapter\",\n",
|
||||
" json={\n",
|
||||
" \"lora_name\": \"lora1\",\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"response = requests.post(\n",
|
||||
" url + \"/load_lora_adapter\",\n",
|
||||
" json={\n",
|
||||
" \"lora_name\": \"lora1\",\n",
|
||||
" \"lora_path\": \"algoprog/fact-generation-llama-3.1-8b-instruct-lora\",\n",
|
||||
" \"pinned\": True, # Pin the adapter to GPU\n",
|
||||
" },\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Verify that the results are expected:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"url = f\"http://127.0.0.1:{port}\"\n",
|
||||
"json_data = {\n",
|
||||
" \"text\": [\n",
|
||||
" \"List 3 countries and their capitals.\",\n",
|
||||
" \"List 3 countries and their capitals.\",\n",
|
||||
" \"List 3 countries and their capitals.\",\n",
|
||||
" ],\n",
|
||||
" \"sampling_params\": {\"max_new_tokens\": 32, \"temperature\": 0},\n",
|
||||
" # The three inputs use lora0, lora1, and lora2, respectively\n",
|
||||
" \"lora_path\": [\"lora0\", \"lora1\", \"lora2\"],\n",
|
||||
"}\n",
|
||||
"response = requests.post(\n",
|
||||
" url + \"/generate\",\n",
|
||||
" json=json_data,\n",
|
||||
")\n",
|
||||
"print(f\"Output from lora0 (pinned): \\n{response.json()[0]['text']}\\n\")\n",
|
||||
"print(f\"Output from lora1 (pinned): \\n{response.json()[1]['text']}\\n\")\n",
|
||||
"print(f\"Output from lora2 (not pinned): \\n{response.json()[2]['text']}\\n\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Future Works\n",
|
||||
"\n",
|
||||
"The development roadmap for LoRA-related features can be found in this [issue](https://github.com/sgl-project/sglang/issues/2929). Other features, including the Embedding Layer, Unified Paging, and the Cutlass backend, are still under development."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
35
docs/advanced_features/observability.md
Normal file
35
docs/advanced_features/observability.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# Observability
|
||||
|
||||
## Production Metrics
|
||||
SGLang exposes the following metrics via Prometheus. You can enable them by adding `--enable-metrics` when launching the server.
|
||||
You can query them by:
|
||||
```
|
||||
curl http://localhost:30000/metrics
|
||||
```
|
||||
|
||||
See [Production Metrics](../references/production_metrics.md) for more details.
|
||||
|
||||
## Logging
|
||||
|
||||
By default, SGLang does not log any request contents. You can log them by using `--log-requests`.
|
||||
You can control the verbosity by using `--log-request-level`.
|
||||
See [Logging](server_arguments.md#logging) for more details.
|
||||
|
||||
## Request Dump and Replay
|
||||
|
||||
You can dump all requests and replay them later for benchmarking or other purposes.
|
||||
|
||||
To start dumping, use the following command to send a request to a server:
|
||||
```
|
||||
python3 -m sglang.srt.managers.configure_logging --url http://localhost:30000 --dump-requests-folder /tmp/sglang_request_dump --dump-requests-threshold 100
|
||||
```
|
||||
The server will dump the requests into a pickle file for every 100 requests.
|
||||
|
||||
To replay the request dump, use `scripts/playground/replay_request_dump.py`.
|
||||
|
||||
## Crash Dump and Replay
|
||||
Sometimes the server might crash, and you may want to debug the cause of the crash.
|
||||
SGLang supports crash dumping, which will dump all requests from the 5 minutes before the crash, allowing you to replay the requests and debug the reason later.
|
||||
|
||||
To enable crash dumping, use `--crash-dump-folder /tmp/crash_dump`.
|
||||
To replay the crash dump, use `scripts/playground/replay_request_dump.py`.
|
||||
150
docs/advanced_features/pd_disaggregation.md
Normal file
150
docs/advanced_features/pd_disaggregation.md
Normal file
@@ -0,0 +1,150 @@
|
||||
# PD Disaggregation
|
||||
|
||||
## Why and What is PD Disaggregation?
|
||||
|
||||
Large Language Model (LLM) inference comprises two distinct phases: **Prefill** and **Decode**. The Prefill phase is computation-intensive, processing the entire input sequence, while the Decode phase is memory-intensive, managing the Key-Value (KV) cache for token generation. Traditionally, these phases are handled within a unified engine, where combined scheduling of prefill and decode batches introduces inefficiencies. To address these challenges, we introduce **Prefill and Decoding (PD) Disaggregation** in SGLang.
|
||||
|
||||
### Issues with Unified Scheduling
|
||||
|
||||
The conventional unified engine, which processes prefill and decode batches together, results in two significant problems:
|
||||
|
||||
1. **Prefill Interruption**: Incoming prefill batches frequently interrupt ongoing decode batches, causing substantial delays in token generation.
|
||||
2. **DP Attention Imbalance**: In data-parallel (DP) attention, one DP worker may process a prefill batch while another handles a decode batch simultaneously, leading to increased decode latency.
|
||||
|
||||
PD Disaggregation resolves these by separating the two stages, enabling tailored optimizations for each.
|
||||
|
||||
For the design details, please refer to [link](https://docs.google.com/document/d/1rQXJwKd5b9b1aOzLh98mnyMhBMhlxXA5ATZTHoQrwvc/edit?tab=t.0).
|
||||
|
||||
Currently, we support Mooncake, NIXL, and Ascend as transfer engines.
|
||||
|
||||
## Router Integration
|
||||
|
||||
For deploying PD disaggregation at scale with load balancing and fault tolerance, SGLang provides a router. The router can distribute requests between prefill and decode instances using various routing policies. For detailed information on setting up routing with PD disaggregation, including configuration options and deployment patterns, see the [SGLang Router documentation](router.md#mode-3-prefill-decode-disaggregation).
|
||||
|
||||
|
||||
## Mooncake
|
||||
### Requirements
|
||||
|
||||
```bash
|
||||
uv pip install mooncake-transfer-engine
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
### Llama Single Node
|
||||
|
||||
```bash
|
||||
$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode prefill --disaggregation-ib-device mlx5_roce0
|
||||
$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode decode --port 30001 --base-gpu-id 1 --disaggregation-ib-device mlx5_roce0
|
||||
$ python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||
### DeepSeek Multi-Node
|
||||
|
||||
```bash
|
||||
# prefill 0
|
||||
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
|
||||
# prefill 1
|
||||
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
|
||||
# decode 0
|
||||
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
|
||||
# decode 1
|
||||
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
|
||||
```
|
||||
### Advanced Configuration
|
||||
|
||||
PD Disaggregation with Mooncake supports the following environment variables for fine-grained control over system behavior.
|
||||
|
||||
#### Prefill Server Configuration
|
||||
| Variable | Description | Default |
|
||||
|:--------:|:-----------:|:--------:|
|
||||
| **`SGLANG_DISAGGREGATION_THREAD_POOL_SIZE`** | Controls the total number of worker threads for KVCache transfer operations per TP rank | A dynamic value calculated by `int(0.75 * os.cpu_count()) // 8`, which is limited to be larger than 4 and less than 12 to ensure efficiency and prevent thread race conditions |
|
||||
| **`SGLANG_DISAGGREGATION_QUEUE_SIZE`** | Sets the number of parallel transfer queues. KVCache transfer requests from multiple decode instances will be sharded into these queues so that they can share the threads and the transfer bandwidth at the same time. If it is set to `1`, requests are transferred one by one in first-come, first-served (FCFS) order | `4` |
|
||||
| **`SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT`** | Timeout (seconds) for receiving destination KV indices during request initialization | `300` |
|
||||
|
||||
If a greater mean TTFT is acceptable, you can `export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600` (10 minutes) to relax the timeout condition.
|
||||
Please be aware that this setting will cause prefill instances to take a longer time to clean up the affected memory resources when a running decode node loses connection.
|
||||
|
||||
#### Decode Server Configuration
|
||||
| Variable | Description | Default |
|
||||
|:--------:|:-----------:|:--------:|
|
||||
| **`SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL`** | Interval (seconds) between health checks to prefill bootstrap servers | `5.0` |
|
||||
| **`SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE`** | Consecutive heartbeat failures before marking prefill server offline | `2` |
|
||||
| **`SGLANG_DISAGGREGATION_WAITING_TIMEOUT`** | Timeout (seconds) for receiving KV Cache after request initialization | `300` |
|
||||
|
||||
If a greater mean TTFT is acceptable, you can `export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600` (10 minutes) to relax the timeout condition.
|
||||
|
||||
|
||||
## NIXL
|
||||
### Requirements
|
||||
|
||||
Install via pip.
|
||||
|
||||
```bash
|
||||
pip install nixl
|
||||
```
|
||||
|
||||
Or build from source - may be required if you already have UCX installed.
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ai-dynamo/nixl.git
|
||||
cd nixl
|
||||
pip install . --config-settings=setup-args="-Ducx_path=/path/to/ucx"
|
||||
```
|
||||
|
||||
|
||||
### Usage
|
||||
|
||||
### Llama Single Node
|
||||
|
||||
```bash
|
||||
$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode prefill --disaggregation-transfer-backend nixl
|
||||
$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode decode --port 30001 --base-gpu-id 1 --disaggregation-transfer-backend nixl
|
||||
$ python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||
### DeepSeek Multi-Node
|
||||
|
||||
```bash
|
||||
# prefill 0
|
||||
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend nixl --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
|
||||
# prefill 1
|
||||
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend nixl --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
|
||||
# decode 0
|
||||
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend nixl --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
|
||||
# decode 1
|
||||
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend nixl --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
|
||||
```
|
||||
|
||||
## ASCEND
|
||||
|
||||
### Usage
|
||||
|
||||
Use the ascend backend with [mf_adapter (download link)](https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com:443/sglang/mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl?AccessKeyId=HPUAXT4YM0U8JNTERLST&Expires=1783151861&Signature=3j10QDUjqk70enaq8lostYV2bEA%3D) installed and `ASCEND_MF_STORE_URL` set:
|
||||
|
||||
```bash
|
||||
pip install mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl --force-reinstall
|
||||
export ASCEND_MF_STORE_URL="tcp://xxx.xx.xxx.xxx:xxxx"
|
||||
```
|
||||
Use the Mooncake backend; more details can be found in the Mooncake section.
|
||||
```bash
|
||||
export ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE=true
|
||||
```
|
||||
|
||||
|
||||
### Llama Single Node
|
||||
|
||||
```bash
|
||||
$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode prefill --disaggregation-transfer-backend ascend
|
||||
$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode decode --port 30001 --base-gpu-id 1 --disaggregation-transfer-backend ascend
|
||||
$ python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||
### DeepSeek Multi-Node
|
||||
|
||||
```bash
|
||||
# prefill 0
|
||||
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend ascend --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 1 --node-rank 0 --tp-size 16
|
||||
# decode 0
|
||||
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend ascend --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 1 --node-rank 0 --tp-size 16
|
||||
```
|
||||
152
docs/advanced_features/quantization.md
Normal file
152
docs/advanced_features/quantization.md
Normal file
@@ -0,0 +1,152 @@
|
||||
# Quantization
|
||||
|
||||
SGLang supports various quantization methods, including offline quantization and online dynamic quantization.
|
||||
|
||||
Offline quantization loads pre-quantized model weights directly during inference. This is required for quantization methods
|
||||
such as GPTQ and AWQ, which collect and pre-compute various statistics from the original weights using the calibration dataset.
|
||||
|
||||
Online quantization dynamically computes scaling parameters—such as the maximum/minimum values of model weights—during runtime.
|
||||
Like NVIDIA FP8 training's [delayed scaling](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html#Mixed-precision-training-with-FP8) mechanism, online quantization calculates the appropriate scaling factors
|
||||
on-the-fly to convert high-precision weights into a lower-precision format.
|
||||
|
||||
**Note: For better performance, usability and convenience, offline quantization is recommended over online quantization.**
|
||||
|
||||
If you use a pre-quantized model, do not add `--quantization` to enable online quantization at the same time.
|
||||
For popular pre-quantized models, please visit [ModelCloud](https://huggingface.co/collections/ModelCloud/vortex-673743382af0a52b2a8b9fe2)
|
||||
or [NeuralMagic](https://huggingface.co/collections/neuralmagic) collections on HF for some
|
||||
popular quality validated quantized models. Quantized models must be validated via benchmarks post-quantization
|
||||
to guard against abnormal quantization loss regressions.
|
||||
|
||||
## Offline Quantization
|
||||
|
||||
To load already quantized models, simply load the model weights and config. **Again, if the model has been quantized offline,
|
||||
there's no need to add `--quantization` argument when starting the engine. The quantization method will be parsed from the
|
||||
downloaded Hugging Face config. For example, DeepSeek V3/R1 models are already in FP8, so do not add redundant parameters.**
|
||||
|
||||
```bash
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4 \
|
||||
--port 30000 --host 0.0.0.0
|
||||
```
|
||||
|
||||
Take note, if your model is **per-channel quantized (INT8 or FP8) with per-token dynamic quantization activation**, you can opt to include `--quantization w8a8_int8` or `--quantization w8a8_fp8` to invoke the corresponding CUTLASS int8_kernel or fp8_kernel in sgl-kernel. This action will ignore the Hugging Face config's quantization settings. For instance, with `neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic`, if you execute with `--quantization w8a8_fp8`, the system will use the `W8A8Fp8Config` from SGLang to invoke the sgl-kernel, rather than the `CompressedTensorsConfig` for vLLM kernels.
|
||||
|
||||
```bash
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic \
|
||||
--quantization w8a8_fp8 \
|
||||
--port 30000 --host 0.0.0.0
|
||||
```
|
||||
|
||||
### Examples of Offline Model Quantization
|
||||
|
||||
#### Using [GPTQModel](https://github.com/ModelCloud/GPTQModel)
|
||||
|
||||
```bash
|
||||
# install
|
||||
pip install gptqmodel --no-build-isolation -v
|
||||
```
|
||||
|
||||
```py
|
||||
from datasets import load_dataset
|
||||
from gptqmodel import GPTQModel, QuantizeConfig
|
||||
|
||||
model_id = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
|
||||
|
||||
calibration_dataset = load_dataset(
|
||||
"allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz",
|
||||
split="train"
|
||||
).select(range(1024))["text"]
|
||||
|
||||
quant_config = QuantizeConfig(bits=4, group_size=128) # quantization config
|
||||
model = GPTQModel.load(model_id, quant_config) # load model
|
||||
|
||||
model.quantize(calibration_dataset, batch_size=2) # quantize
|
||||
model.save(quant_path) # save model
|
||||
```
|
||||
|
||||
#### Using [LLM Compressor](https://github.com/vllm-project/llm-compressor/)
|
||||
|
||||
```bash
|
||||
# install
|
||||
pip install llmcompressor
|
||||
```
|
||||
|
||||
Here, we take quantizing `meta-llama/Meta-Llama-3-8B-Instruct` to `FP8` as an example to elaborate on how to do offline quantization.
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer
|
||||
from llmcompressor.transformers import SparseAutoModelForCausalLM
|
||||
from llmcompressor.transformers import oneshot
|
||||
from llmcompressor.modifiers.quantization import QuantizationModifier
|
||||
|
||||
# Step 1: Load the original model.
|
||||
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
|
||||
model = SparseAutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
|
||||
# Step 2: Perform offline quantization.
|
||||
# Step 2.1: Configure the simple PTQ quantization.
|
||||
recipe = QuantizationModifier(
|
||||
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
|
||||
|
||||
# Step 2.2: Apply the quantization algorithm.
|
||||
oneshot(model=model, recipe=recipe)
|
||||
|
||||
# Step 3: Save the model.
|
||||
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
|
||||
model.save_pretrained(SAVE_DIR)
|
||||
tokenizer.save_pretrained(SAVE_DIR)
|
||||
```
|
||||
|
||||
Then, you can directly use the quantized model with `SGLang`, by using the following command:
|
||||
|
||||
```bash
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path $PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic \
|
||||
--port 30000 --host 0.0.0.0
|
||||
```
|
||||
|
||||
## Online Quantization
|
||||
|
||||
To enable online quantization, you can simply specify `--quantization` in the command line. For example, you can launch the server with the following command to enable `FP8` quantization for model `meta-llama/Meta-Llama-3.1-8B-Instruct`:
|
||||
|
||||
```bash
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||
--quantization fp8 \
|
||||
--port 30000 --host 0.0.0.0
|
||||
```
|
||||
|
||||
Our team is working on supporting more online quantization methods. SGLang will soon support methods including but not limited to `["awq", "gptq", "marlin", "gptq_marlin", "awq_marlin", "bitsandbytes", "gguf"]`.
|
||||
|
||||
SGLang also supports quantization methods based on [torchao](https://github.com/pytorch/ao). You can simply specify `--torchao-config` in the command line to support this feature. For example, if you want to enable `int4wo-128` for model `meta-llama/Meta-Llama-3.1-8B-Instruct`, you can launch the server with the following command:
|
||||
|
||||
```bash
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||
--torchao-config int4wo-128 \
|
||||
--port 30000 --host 0.0.0.0
|
||||
```
|
||||
|
||||
SGLang supports the following quantization methods based on torchao `["int8dq", "int8wo", "fp8wo", "fp8dq-per_tensor", "fp8dq-per_row", "int4wo-32", "int4wo-64", "int4wo-128", "int4wo-256"]`.
|
||||
|
||||
Note: According to [this issue](https://github.com/sgl-project/sglang/issues/2219#issuecomment-2561890230), `"int8dq"` method currently has some bugs when using together with cuda graph capture. So we suggest to disable cuda graph capture when using `"int8dq"` method. Namely, please use the following command:
|
||||
|
||||
```bash
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||
--torchao-config int8dq \
|
||||
--disable-cuda-graph \
|
||||
--port 30000 --host 0.0.0.0
|
||||
```
|
||||
|
||||
## Reference
|
||||
|
||||
- [GPTQModel](https://github.com/ModelCloud/GPTQModel)
|
||||
- [LLM Compressor](https://github.com/vllm-project/llm-compressor/)
|
||||
- [Torchao: PyTorch Architecture Optimization](https://github.com/pytorch/ao)
|
||||
- [vLLM Quantization](https://docs.vllm.ai/en/latest/quantization/)
|
||||
445
docs/advanced_features/router.md
Normal file
445
docs/advanced_features/router.md
Normal file
@@ -0,0 +1,445 @@
|
||||
# SGLang Router
|
||||
|
||||
The SGLang Router is a high-performance request distribution system that routes inference requests across multiple SGLang runtime instances. It features cache-aware load balancing, fault tolerance, and support for advanced deployment patterns including data parallelism and prefill-decode disaggregation.
|
||||
|
||||
## Key Features
|
||||
|
||||
- **Cache-Aware Load Balancing**: Optimizes cache utilization while maintaining balanced load distribution
|
||||
- **Multiple Routing Policies**: Choose from random, round-robin, cache-aware, or power-of-two policies
|
||||
- **Fault Tolerance**: Automatic retry and circuit breaker mechanisms for resilient operation
|
||||
- **Dynamic Scaling**: Add or remove workers at runtime without service interruption
|
||||
- **Kubernetes Integration**: Native service discovery and pod management
|
||||
- **Prefill-Decode Disaggregation**: Support for disaggregated serving load balancing
|
||||
- **Prometheus Metrics**: Built-in observability and monitoring
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install sglang-router
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
To see all available options:
|
||||
|
||||
```bash
|
||||
python -m sglang_router.launch_server --help # Co-launch router and workers
|
||||
python -m sglang_router.launch_router --help # Launch router only
|
||||
```
|
||||
|
||||
## Deployment Modes
|
||||
|
||||
The router supports three primary deployment patterns:
|
||||
|
||||
1. **Co-launch Mode**: Router and workers launch together (simplest for single-node deployments)
|
||||
2. **Separate Launch Mode**: Router and workers launch independently (best for multi-node setups)
|
||||
3. **Prefill-Decode Disaggregation**: Specialized mode for disaggregated serving
|
||||
|
||||
### Mode 1: Co-launch Router and Workers
|
||||
|
||||
This mode launches both the router and multiple worker instances in a single command. It's the simplest deployment option and replaces the `--dp-size` argument of SGLang Runtime.
|
||||
|
||||
```bash
|
||||
# Launch router with 4 workers
|
||||
python -m sglang_router.launch_server \
|
||||
--model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||
--dp-size 4 \
|
||||
--host 0.0.0.0 \
|
||||
--port 30000
|
||||
```
|
||||
|
||||
#### Sending Requests
|
||||
|
||||
Once the server is ready, send requests to the router endpoint:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
# Using the /generate endpoint
|
||||
url = "http://localhost:30000/generate"
|
||||
data = {
|
||||
"text": "What is the capital of France?",
|
||||
"sampling_params": {
|
||||
"temperature": 0.7,
|
||||
"max_new_tokens": 100
|
||||
}
|
||||
}
|
||||
|
||||
response = requests.post(url, json=data)
|
||||
print(response.json())
|
||||
|
||||
# OpenAI-compatible endpoint
|
||||
url = "http://localhost:30000/v1/chat/completions"
|
||||
data = {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"messages": [{"role": "user", "content": "What is the capital of France?"}]
|
||||
}
|
||||
|
||||
response = requests.post(url, json=data)
|
||||
print(response.json())
|
||||
```
|
||||
|
||||
### Mode 2: Separate Launch Mode
|
||||
|
||||
This mode is ideal for multi-node deployments where workers run on different machines.
|
||||
|
||||
#### Step 1: Launch Workers
|
||||
|
||||
On each worker node:
|
||||
|
||||
```bash
|
||||
# Worker node 1
|
||||
python -m sglang.launch_server \
|
||||
--model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||
--host 0.0.0.0 \
|
||||
--port 8000
|
||||
|
||||
# Worker node 2
|
||||
python -m sglang.launch_server \
|
||||
--model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||
--host 0.0.0.0 \
|
||||
--port 8001
|
||||
```
|
||||
|
||||
#### Step 2: Launch Router
|
||||
|
||||
On the router node:
|
||||
|
||||
```bash
|
||||
python -m sglang_router.launch_router \
|
||||
--worker-urls http://worker1:8000 http://worker2:8001 \
|
||||
--host 0.0.0.0 \
|
||||
--port 30000 \
|
||||
--policy cache_aware # or random, round_robin, power_of_two
|
||||
```
|
||||
|
||||
### Mode 3: Prefill-Decode Disaggregation
|
||||
|
||||
This advanced mode separates prefill and decode operations for optimized performance:
|
||||
|
||||
```bash
|
||||
python -m sglang_router.launch_router \
|
||||
--pd-disaggregation \
|
||||
--prefill http://prefill1:8000 9000 \
|
||||
--prefill http://prefill2:8001 9001 \
|
||||
--decode http://decode1:8002 \
|
||||
--decode http://decode2:8003 \
|
||||
--prefill-policy cache_aware \
|
||||
--decode-policy round_robin
|
||||
```
|
||||
|
||||
#### Understanding --prefill Arguments
|
||||
|
||||
The `--prefill` flag accepts URLs with optional bootstrap ports:
|
||||
- `--prefill http://server:8000` - No bootstrap port
|
||||
- `--prefill http://server:8000 9000` - Bootstrap port 9000
|
||||
- `--prefill http://server:8000 none` - Explicitly no bootstrap port
|
||||
|
||||
#### Policy Inheritance in PD Mode
|
||||
|
||||
The router intelligently handles policy configuration for prefill and decode nodes:
|
||||
|
||||
1. **Only `--policy` specified**: Both prefill and decode nodes use this policy
|
||||
2. **`--policy` and `--prefill-policy` specified**: Prefill nodes use `--prefill-policy`, decode nodes use `--policy`
|
||||
3. **`--policy` and `--decode-policy` specified**: Prefill nodes use `--policy`, decode nodes use `--decode-policy`
|
||||
4. **All three specified**: Prefill nodes use `--prefill-policy`, decode nodes use `--decode-policy` (main `--policy` is ignored)
|
||||
|
||||
Example with mixed policies:
|
||||
```bash
|
||||
python -m sglang_router.launch_router \
|
||||
--pd-disaggregation \
|
||||
    --prefill http://prefill1:8000 \
|
||||
--prefill http://prefill2:8000 \
|
||||
    --decode http://decode1:8001 \
|
||||
--decode http://decode2:8001 \
|
||||
--policy round_robin \
|
||||
--prefill-policy cache_aware # Prefill uses cache_aware and decode uses round_robin from --policy
|
||||
```
|
||||
|
||||
#### PD Mode with Service Discovery
|
||||
|
||||
For Kubernetes deployments with separate prefill and decode server pools:
|
||||
|
||||
```bash
|
||||
python -m sglang_router.launch_router \
|
||||
--pd-disaggregation \
|
||||
--service-discovery \
|
||||
--prefill-selector app=prefill-server tier=gpu \
|
||||
--decode-selector app=decode-server tier=cpu \
|
||||
--service-discovery-namespace production \
|
||||
--prefill-policy cache_aware \
|
||||
--decode-policy round_robin
|
||||
```
|
||||
|
||||
## Dynamic Scaling
|
||||
|
||||
The router supports runtime scaling through REST APIs:
|
||||
|
||||
### Adding Workers
|
||||
|
||||
```bash
|
||||
# Launch a new worker
|
||||
python -m sglang.launch_server \
|
||||
--model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||
--port 30001
|
||||
|
||||
# Add it to the router
|
||||
curl -X POST "http://localhost:30000/add_worker?url=http://127.0.0.1:30001"
|
||||
```
|
||||
|
||||
### Removing Workers
|
||||
|
||||
```bash
|
||||
curl -X POST "http://localhost:30000/remove_worker?url=http://127.0.0.1:30001"
|
||||
```
|
||||
|
||||
**Note**: When using cache-aware routing, removed workers are cleanly evicted from the routing tree and request queues.
|
||||
|
||||
## Fault Tolerance
|
||||
|
||||
The router includes comprehensive fault tolerance mechanisms:
|
||||
|
||||
### Retry Configuration
|
||||
|
||||
```bash
|
||||
python -m sglang_router.launch_router \
|
||||
--worker-urls http://worker1:8000 http://worker2:8001 \
|
||||
--retry-max-retries 3 \
|
||||
--retry-initial-backoff-ms 100 \
|
||||
--retry-max-backoff-ms 10000 \
|
||||
--retry-backoff-multiplier 2.0 \
|
||||
--retry-jitter-factor 0.1
|
||||
```
|
||||
|
||||
### Circuit Breaker
|
||||
|
||||
Protects against cascading failures:
|
||||
|
||||
```bash
|
||||
python -m sglang_router.launch_router \
|
||||
--worker-urls http://worker1:8000 http://worker2:8001 \
|
||||
--cb-failure-threshold 5 \
|
||||
--cb-success-threshold 2 \
|
||||
--cb-timeout-duration-secs 30 \
|
||||
--cb-window-duration-secs 60
|
||||
```
|
||||
|
||||
**Behavior**:
|
||||
- Worker is marked unhealthy after `cb-failure-threshold` consecutive failures
|
||||
- Returns to service after `cb-success-threshold` successful health checks
|
||||
- Circuit breaker can be disabled with `--disable-circuit-breaker`
|
||||
|
||||
## Routing Policies
|
||||
|
||||
The router supports multiple routing strategies:
|
||||
|
||||
### 1. Random Routing
|
||||
Distributes requests randomly across workers.
|
||||
|
||||
```bash
|
||||
--policy random
|
||||
```
|
||||
|
||||
### 2. Round-Robin Routing
|
||||
Cycles through workers in order.
|
||||
|
||||
```bash
|
||||
--policy round_robin
|
||||
```
|
||||
|
||||
### 3. Power of Two Choices
|
||||
Samples two workers and routes to the less loaded one.
|
||||
|
||||
```bash
|
||||
--policy power_of_two
|
||||
```
|
||||
|
||||
### 4. Cache-Aware Load Balancing (Default)
|
||||
|
||||
The most sophisticated policy that combines cache optimization with load balancing:
|
||||
|
||||
```bash
|
||||
--policy cache_aware \
|
||||
--cache-threshold 0.5 \
|
||||
--balance-abs-threshold 32 \
|
||||
--balance-rel-threshold 1.0001
|
||||
```
|
||||
|
||||
#### How It Works
|
||||
|
||||
1. **Load Assessment**: Checks if the system is balanced
|
||||
- Imbalanced if: `(max_load - min_load) > balance_abs_threshold` AND `max_load > balance_rel_threshold * min_load`
|
||||
|
||||
2. **Routing Decision**:
|
||||
- **Balanced System**: Uses cache-aware routing
|
||||
- Routes to worker with highest prefix match if match > `cache_threshold`
|
||||
- Otherwise routes to worker with most available cache capacity
|
||||
- **Imbalanced System**: Uses shortest queue routing to the least busy worker
|
||||
|
||||
3. **Cache Management**:
|
||||
- Maintains approximate radix trees per worker
|
||||
- Periodically evicts LRU entries based on `--eviction-interval-secs` and `--max-tree-size`
|
||||
|
||||
### Data Parallelism Aware Routing
|
||||
|
||||
Enables fine-grained control over data parallel replicas:
|
||||
|
||||
```bash
|
||||
--dp-aware \
|
||||
--api-key your_api_key # Required for worker authentication
|
||||
```
|
||||
|
||||
This mode coordinates with SGLang's DP controller for optimized request distribution across data parallel ranks.
|
||||
|
||||
## Configuration Reference
|
||||
|
||||
### Core Settings
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
| --------------------------- | ---- | ----------- | --------------------------------------------------------------- |
|
||||
| `--host` | str | 127.0.0.1 | Router server host address |
|
||||
| `--port` | int | 30000 | Router server port |
|
||||
| `--worker-urls` | list | [] | Worker URLs for separate launch mode |
|
||||
| `--policy` | str | cache_aware | Routing policy (random, round_robin, cache_aware, power_of_two) |
|
||||
| `--max-concurrent-requests` | int | 64 | Maximum concurrent requests (rate limiting) |
|
||||
| `--request-timeout-secs` | int | 600 | Request timeout in seconds |
|
||||
| `--max-payload-size` | int | 256MB | Maximum request payload size |
|
||||
|
||||
### Cache-Aware Routing Parameters
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
| -------------------------- | ----- | -------- | ------------------------------------------------------ |
|
||||
| `--cache-threshold` | float | 0.5 | Minimum prefix match ratio for cache routing (0.0-1.0) |
|
||||
| `--balance-abs-threshold` | int | 32 | Absolute load difference threshold |
|
||||
| `--balance-rel-threshold` | float | 1.0001 | Relative load ratio threshold |
|
||||
| `--eviction-interval-secs` | int | 60 | Seconds between cache eviction cycles |
|
||||
| `--max-tree-size` | int | 16777216 | Maximum nodes in routing tree |
|
||||
|
||||
### Fault Tolerance Parameters
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
| ---------------------------- | ----- | ------- | ------------------------------------- |
|
||||
| `--retry-max-retries` | int | 3 | Maximum retry attempts per request |
|
||||
| `--retry-initial-backoff-ms` | int | 100 | Initial retry backoff in milliseconds |
|
||||
| `--retry-max-backoff-ms` | int | 10000 | Maximum retry backoff in milliseconds |
|
||||
| `--retry-backoff-multiplier` | float | 2.0 | Backoff multiplier between retries |
|
||||
| `--retry-jitter-factor` | float | 0.1 | Random jitter factor for retries |
|
||||
| `--disable-retries` | flag | False | Disable retry mechanism |
|
||||
| `--cb-failure-threshold` | int | 5 | Failures before circuit opens |
|
||||
| `--cb-success-threshold` | int | 2 | Successes to close circuit |
|
||||
| `--cb-timeout-duration-secs` | int | 30 | Circuit breaker timeout duration |
|
||||
| `--cb-window-duration-secs` | int | 60 | Circuit breaker window duration |
|
||||
| `--disable-circuit-breaker` | flag | False | Disable circuit breaker |
|
||||
|
||||
### Prefill-Decode Disaggregation Parameters
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
| --------------------------------- | ---- | ------- | ----------------------------------------------------- |
|
||||
| `--pd-disaggregation` | flag | False | Enable PD disaggregated mode |
|
||||
| `--prefill` | list | [] | Prefill server URLs with optional bootstrap ports |
|
||||
| `--decode` | list | [] | Decode server URLs |
|
||||
| `--prefill-policy` | str | None | Routing policy for prefill nodes (overrides --policy) |
|
||||
| `--decode-policy` | str | None | Routing policy for decode nodes (overrides --policy) |
|
||||
| `--worker-startup-timeout-secs` | int | 300 | Timeout for worker startup |
|
||||
| `--worker-startup-check-interval` | int | 10 | Interval between startup checks |
|
||||
|
||||
### Kubernetes Integration
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
| ------------------------------- | ---- | ------------------------ | ---------------------------------------------------- |
|
||||
| `--service-discovery` | flag | False | Enable Kubernetes service discovery |
|
||||
| `--selector` | list | [] | Label selector for workers (key1=value1 key2=value2) |
|
||||
| `--prefill-selector` | list | [] | Label selector for prefill servers in PD mode |
|
||||
| `--decode-selector` | list | [] | Label selector for decode servers in PD mode |
|
||||
| `--service-discovery-port` | int | 80 | Port for discovered pods |
|
||||
| `--service-discovery-namespace` | str | None | Kubernetes namespace to watch |
|
||||
| `--bootstrap-port-annotation` | str | sglang.ai/bootstrap-port | Annotation for bootstrap ports |
|
||||
|
||||
### Observability
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
| ---------------------- | ---- | --------- | ----------------------------------------------------- |
|
||||
| `--prometheus-port` | int | 29000 | Prometheus metrics port |
|
||||
| `--prometheus-host` | str | 127.0.0.1 | Prometheus metrics host |
|
||||
| `--log-dir` | str | None | Directory for log files |
|
||||
| `--log-level` | str | info | Logging level (debug, info, warning, error, critical) |
|
||||
| `--request-id-headers` | list | None | Custom headers for request tracing |
|
||||
|
||||
### CORS Configuration
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
| ------------------------ | ---- | ------- | -------------------- |
|
||||
| `--cors-allowed-origins` | list | [] | Allowed CORS origins |
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Kubernetes Service Discovery
|
||||
|
||||
Automatically discover and manage workers in Kubernetes:
|
||||
|
||||
#### Standard Mode
|
||||
```bash
|
||||
python -m sglang_router.launch_router \
|
||||
--service-discovery \
|
||||
--selector app=sglang-worker env=prod \
|
||||
--service-discovery-namespace production \
|
||||
--service-discovery-port 8000
|
||||
```
|
||||
|
||||
#### Prefill-Decode Disaggregation Mode
|
||||
```bash
|
||||
python -m sglang_router.launch_router \
|
||||
--pd-disaggregation \
|
||||
--service-discovery \
|
||||
--prefill-selector app=prefill-server env=prod \
|
||||
--decode-selector app=decode-server env=prod \
|
||||
--service-discovery-namespace production
|
||||
```
|
||||
|
||||
**Note**: The `--bootstrap-port-annotation` (default: `sglang.ai/bootstrap-port`) is used to discover bootstrap ports for prefill servers in PD mode. Prefill pods should have this annotation set to their bootstrap port value.
|
||||
|
||||
### Prometheus Metrics
|
||||
|
||||
Expose metrics for monitoring:
|
||||
|
||||
```bash
|
||||
python -m sglang_router.launch_router \
|
||||
--worker-urls http://worker1:8000 http://worker2:8001 \
|
||||
--prometheus-port 29000 \
|
||||
--prometheus-host 0.0.0.0
|
||||
```
|
||||
|
||||
Metrics available at `http://localhost:29000/metrics`
|
||||
|
||||
### Request Tracing
|
||||
|
||||
Enable request ID tracking:
|
||||
|
||||
```bash
|
||||
python -m sglang_router.launch_router \
|
||||
--worker-urls http://worker1:8000 http://worker2:8001 \
|
||||
--request-id-headers x-request-id x-trace-id
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Workers not connecting**: Ensure workers are fully initialized before starting the router. Use `--worker-startup-timeout-secs` to increase wait time.
|
||||
|
||||
2. **High latency**: Check if cache-aware routing is causing imbalance. Try adjusting `--balance-abs-threshold` and `--balance-rel-threshold`.
|
||||
|
||||
3. **Memory growth**: Reduce `--max-tree-size` or decrease `--eviction-interval-secs` for more aggressive cache cleanup.
|
||||
|
||||
4. **Circuit breaker triggering frequently**: Increase `--cb-failure-threshold` or extend `--cb-window-duration-secs`.
|
||||
|
||||
### Debug Mode
|
||||
|
||||
Enable detailed logging:
|
||||
|
||||
```bash
|
||||
python -m sglang_router.launch_router \
|
||||
--worker-urls http://worker1:8000 http://worker2:8001 \
|
||||
--log-level debug \
|
||||
--log-dir ./router_logs
|
||||
```
|
||||
381
docs/advanced_features/separate_reasoning.ipynb
Normal file
381
docs/advanced_features/separate_reasoning.ipynb
Normal file
@@ -0,0 +1,381 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Reasoning Parser\n",
|
||||
"\n",
|
||||
"SGLang supports parsing reasoning content out from \"normal\" content for reasoning models such as [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1).\n",
|
||||
"\n",
|
||||
"## Supported Models & Parsers\n",
|
||||
"\n",
|
||||
"| Model | Reasoning tags | Parser | Notes |\n",
|
||||
"|---------|-----------------------------|------------------|-------|\n",
|
||||
"| [DeepSeek‑R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `<think>` … `</think>` | `deepseek-r1` | Supports all variants (R1, R1-0528, R1-Distill) |\n",
|
||||
"| [DeepSeek‑V3.1](https://huggingface.co/deepseek-ai/DeepSeek-V3.1) | `<think>` … `</think>` | `deepseek-v3` | Supports `thinking` parameter |\n",
|
||||
"| [Standard Qwen3 models](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `<think>` … `</think>` | `qwen3` | Supports `enable_thinking` parameter |\n",
|
||||
"| [Qwen3-Thinking models](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) | `<think>` … `</think>` | `qwen3` or `qwen3-thinking` | Always generates thinking content |\n",
|
||||
"| [Kimi models](https://huggingface.co/moonshotai/models) | `◁think▷` … `◁/think▷` | `kimi` | Uses special thinking delimiters |\n",
|
||||
"| [GPT OSS](https://huggingface.co/openai/gpt-oss-120b) | `<\\|channel\\|>analysis<\\|message\\|>` … `<\\|end\\|>` | `gpt-oss` | N/A |\n",
|
||||
"### Model-Specific Behaviors\n",
|
||||
"\n",
|
||||
"**DeepSeek-R1 Family:**\n",
|
||||
"- DeepSeek-R1: No `<think>` start tag, jumps directly to thinking content\n",
|
||||
"- DeepSeek-R1-0528: Generates both `<think>` start and `</think>` end tags\n",
|
||||
"- Both are handled by the same `deepseek-r1` parser\n",
|
||||
"\n",
|
||||
"**DeepSeek-V3 Family:**\n",
|
||||
"- DeepSeek-V3.1: Hybrid model supporting both thinking and non-thinking modes, use the `deepseek-v3` parser and `thinking` parameter (NOTE: not `enable_thinking`)\n",
|
||||
"\n",
|
||||
"**Qwen3 Family:**\n",
|
||||
"- Standard Qwen3 (e.g., Qwen3-2507): Use `qwen3` parser, supports `enable_thinking` in chat templates\n",
|
||||
"- Qwen3-Thinking (e.g., Qwen3-235B-A22B-Thinking-2507): Use `qwen3` or `qwen3-thinking` parser, always thinks\n",
|
||||
"\n",
|
||||
"**Kimi:**\n",
|
||||
"- Kimi: Uses special `◁think▷` and `◁/think▷` tags\n",
|
||||
"\n",
|
||||
"**GPT OSS:**\n",
|
||||
"- GPT OSS: Uses special `<|channel|>analysis<|message|>` and `<|end|>` tags"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Usage\n",
|
||||
"\n",
|
||||
"### Launching the Server"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Specify the `--reasoning-parser` option."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"from openai import OpenAI\n",
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
|
||||
"\n",
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note that `--reasoning-parser` defines the parser used to interpret responses."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### OpenAI Compatible API\n",
|
||||
"\n",
|
||||
"Using the OpenAI compatible API, the contract follows the [DeepSeek API design](https://api-docs.deepseek.com/guides/reasoning_model) established with the release of DeepSeek-R1:\n",
|
||||
"\n",
|
||||
"- `reasoning_content`: The content of the CoT.\n",
|
||||
"- `content`: The content of the final answer."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Initialize OpenAI-like client\n",
|
||||
"client = OpenAI(api_key=\"None\", base_url=f\"http://0.0.0.0:{port}/v1\")\n",
|
||||
"model_name = client.models.list().data[0].id\n",
|
||||
"\n",
|
||||
"messages = [\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"What is 1+3?\",\n",
|
||||
" }\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Non-Streaming Request"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response_non_stream = client.chat.completions.create(\n",
|
||||
" model=model_name,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0.6,\n",
|
||||
" top_p=0.95,\n",
|
||||
" stream=False, # Non-streaming\n",
|
||||
" extra_body={\"separate_reasoning\": True},\n",
|
||||
")\n",
|
||||
"print_highlight(\"==== Reasoning ====\")\n",
|
||||
"print_highlight(response_non_stream.choices[0].message.reasoning_content)\n",
|
||||
"\n",
|
||||
"print_highlight(\"==== Text ====\")\n",
|
||||
"print_highlight(response_non_stream.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Streaming Request"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response_stream = client.chat.completions.create(\n",
|
||||
" model=model_name,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0.6,\n",
|
||||
" top_p=0.95,\n",
|
||||
" stream=True, # Non-streaming\n",
|
||||
" extra_body={\"separate_reasoning\": True},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"reasoning_content = \"\"\n",
|
||||
"content = \"\"\n",
|
||||
"for chunk in response_stream:\n",
|
||||
" if chunk.choices[0].delta.content:\n",
|
||||
" content += chunk.choices[0].delta.content\n",
|
||||
" if chunk.choices[0].delta.reasoning_content:\n",
|
||||
" reasoning_content += chunk.choices[0].delta.reasoning_content\n",
|
||||
"\n",
|
||||
"print_highlight(\"==== Reasoning ====\")\n",
|
||||
"print_highlight(reasoning_content)\n",
|
||||
"\n",
|
||||
"print_highlight(\"==== Text ====\")\n",
|
||||
"print_highlight(content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Optionally, you can buffer the reasoning content to the last reasoning chunk (or the first chunk after the reasoning content)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response_stream = client.chat.completions.create(\n",
|
||||
" model=model_name,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0.6,\n",
|
||||
" top_p=0.95,\n",
|
||||
" stream=True, # Non-streaming\n",
|
||||
" extra_body={\"separate_reasoning\": True, \"stream_reasoning\": False},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"reasoning_content = \"\"\n",
|
||||
"content = \"\"\n",
|
||||
"for chunk in response_stream:\n",
|
||||
" if chunk.choices[0].delta.content:\n",
|
||||
" content += chunk.choices[0].delta.content\n",
|
||||
" if chunk.choices[0].delta.reasoning_content:\n",
|
||||
" reasoning_content += chunk.choices[0].delta.reasoning_content\n",
|
||||
"\n",
|
||||
"print_highlight(\"==== Reasoning ====\")\n",
|
||||
"print_highlight(reasoning_content)\n",
|
||||
"\n",
|
||||
"print_highlight(\"==== Text ====\")\n",
|
||||
"print_highlight(content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The reasoning separation is enable by default when specify . \n",
|
||||
"**To disable it, set the `separate_reasoning` option to `False` in request.**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response_non_stream = client.chat.completions.create(\n",
|
||||
" model=model_name,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0.6,\n",
|
||||
" top_p=0.95,\n",
|
||||
" stream=False, # Non-streaming\n",
|
||||
" extra_body={\"separate_reasoning\": False},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(\"==== Original Output ====\")\n",
|
||||
"print_highlight(response_non_stream.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### SGLang Native API "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import AutoTokenizer\n",
|
||||
"\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n",
|
||||
"input = tokenizer.apply_chat_template(\n",
|
||||
" messages,\n",
|
||||
" tokenize=False,\n",
|
||||
" add_generation_prompt=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"gen_url = f\"http://localhost:{port}/generate\"\n",
|
||||
"gen_data = {\n",
|
||||
" \"text\": input,\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"skip_special_tokens\": False,\n",
|
||||
" \"max_new_tokens\": 1024,\n",
|
||||
" \"temperature\": 0.6,\n",
|
||||
" \"top_p\": 0.95,\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
|
||||
"\n",
|
||||
"print_highlight(\"==== Original Output ====\")\n",
|
||||
"print_highlight(gen_response)\n",
|
||||
"\n",
|
||||
"parse_url = f\"http://localhost:{port}/separate_reasoning\"\n",
|
||||
"separate_reasoning_data = {\n",
|
||||
" \"text\": gen_response,\n",
|
||||
" \"reasoning_parser\": \"deepseek-r1\",\n",
|
||||
"}\n",
|
||||
"separate_reasoning_response_json = requests.post(\n",
|
||||
" parse_url, json=separate_reasoning_data\n",
|
||||
").json()\n",
|
||||
"print_highlight(\"==== Reasoning ====\")\n",
|
||||
"print_highlight(separate_reasoning_response_json[\"reasoning_text\"])\n",
|
||||
"print_highlight(\"==== Text ====\")\n",
|
||||
"print_highlight(separate_reasoning_response_json[\"text\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Offline Engine API"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sglang as sgl\n",
|
||||
"from sglang.srt.parser.reasoning_parser import ReasoningParser\n",
|
||||
"from sglang.utils import print_highlight\n",
|
||||
"\n",
|
||||
"llm = sgl.Engine(model_path=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n",
|
||||
"input = tokenizer.apply_chat_template(\n",
|
||||
" messages,\n",
|
||||
" tokenize=False,\n",
|
||||
" add_generation_prompt=True,\n",
|
||||
")\n",
|
||||
"sampling_params = {\n",
|
||||
" \"max_new_tokens\": 1024,\n",
|
||||
" \"skip_special_tokens\": False,\n",
|
||||
" \"temperature\": 0.6,\n",
|
||||
" \"top_p\": 0.95,\n",
|
||||
"}\n",
|
||||
"result = llm.generate(prompt=input, sampling_params=sampling_params)\n",
|
||||
"\n",
|
||||
"generated_text = result[\"text\"] # Assume there is only one prompt\n",
|
||||
"\n",
|
||||
"print_highlight(\"==== Original Output ====\")\n",
|
||||
"print_highlight(generated_text)\n",
|
||||
"\n",
|
||||
"parser = ReasoningParser(\"deepseek-r1\")\n",
|
||||
"reasoning_text, text = parser.parse_non_stream(generated_text)\n",
|
||||
"print_highlight(\"==== Reasoning ====\")\n",
|
||||
"print_highlight(reasoning_text)\n",
|
||||
"print_highlight(\"==== Text ====\")\n",
|
||||
"print_highlight(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm.shutdown()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Supporting New Reasoning Model Schemas\n",
|
||||
"\n",
|
||||
"For future reasoning models, you can implement the reasoning parser as a subclass of `BaseReasoningFormatDetector` in `python/sglang/srt/reasoning_parser.py` and specify the reasoning parser for new reasoning model schemas accordingly."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
318
docs/advanced_features/server_arguments.md
Normal file
318
docs/advanced_features/server_arguments.md
Normal file
@@ -0,0 +1,318 @@
|
||||
# Server Arguments
|
||||
|
||||
This page provides a list of server arguments used in the command line to configure the behavior
|
||||
and performance of the language model server during deployment. These arguments enable users to
|
||||
customize key aspects of the server, including model selection, parallelism policies,
|
||||
memory management, and optimization techniques.
|
||||
You can find all arguments by `python3 -m sglang.launch_server --help`
|
||||
|
||||
## Common launch commands
|
||||
|
||||
- To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 2
|
||||
```
|
||||
|
||||
- To enable multi-GPU data parallelism, add `--dp 2`. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total. We recommend [SGLang Router](../advanced_features/router.md) for data parallelism.
|
||||
|
||||
```bash
|
||||
python -m sglang_router.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --dp 2 --tp 2
|
||||
```
|
||||
|
||||
- If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --mem-fraction-static 0.7
|
||||
```
|
||||
|
||||
- See [hyperparameter tuning](hyperparameter_tuning.md) on tuning hyperparameters for better performance.
|
||||
- For docker and Kubernetes runs, you need to set up shared memory which is used for communication between processes. See `--shm-size` for docker and `/dev/shm` size update for Kubernetes manifests.
|
||||
- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
|
||||
```
|
||||
|
||||
- To enable `torch.compile` acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. By default, the cache path is located at `/tmp/torchinductor_root`, you can customize it using environment variable `TORCHINDUCTOR_CACHE_DIR`. For more details, please refer to [PyTorch official documentation](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html) and [Enabling cache for torch.compile](https://docs.sglang.ai/backend/hyperparameter_tuning.html#enabling-cache-for-torch-compile).
|
||||
- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports other [quantization strategies (INT8/FP8)](https://github.com/sgl-project/sglang/blob/v0.3.6/python/sglang/srt/server_args.py#L671) as well.
|
||||
- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
|
||||
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
|
||||
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](../references/custom_chat_template.md).
|
||||
- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
|
||||
|
||||
```bash
|
||||
# Node 0
|
||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --dist-init-addr sgl-dev-0:50000 --nnodes 2 --node-rank 0
|
||||
|
||||
# Node 1
|
||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --dist-init-addr sgl-dev-0:50000 --nnodes 2 --node-rank 1
|
||||
```
|
||||
|
||||
Please consult the documentation below and [server_args.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/server_args.py) to learn more about the arguments you may provide when launching a server.
|
||||
|
||||
## Model and tokenizer
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--model-path` | The path of the model weights. This can be a local folder or a Hugging Face repo ID. | None |
|
||||
| `--tokenizer-path` | The path of the tokenizer. | None |
|
||||
| `--tokenizer-mode` | Tokenizer mode. 'auto' will use the fast tokenizer if available, and 'slow' will always use the slow tokenizer. | auto |
|
||||
| `--skip-tokenizer-init` | If set, skip init tokenizer and pass input_ids in generate request. | False |
|
||||
| `--load-format` | The format of the model weights to load. 'auto' will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available. 'pt' will load the weights in the pytorch bin format. 'safetensors' will load the weights in the safetensors format. 'npcache' will load the weights in pytorch format and store a numpy cache to speed up the loading. 'dummy' will initialize the weights with random values, which is mainly for profiling. 'gguf' will load the weights in the gguf format. 'bitsandbytes' will load the weights using bitsandbytes quantization. 'layered' loads weights layer by layer so that one can quantize a layer before loading another to make the peak memory envelope smaller. | auto |
|
||||
| `--trust-remote-code` | Whether or not to allow for custom models defined on the Hub in their own modeling files. | False |
|
||||
| `--context-length` | The model's maximum context length. Defaults to None (will use the value from the model's config.json instead). | None |
|
||||
| `--is-embedding` | Whether to use a CausalLM as an embedding model. | False |
|
||||
| `--enable-multimodal` | Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen. | None |
|
||||
| `--revision` | The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. | None |
|
||||
| `--model-impl` | Which implementation of the model to use. 'auto' will try to use the SGLang implementation if it exists and fall back to the Transformers implementation if no SGLang implementation is available. 'sglang' will use the SGLang model implementation. 'transformers' will use the Transformers model implementation. | auto |
|
||||
|
||||
## HTTP server
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--host` | The host address for the server. | 127.0.0.1 |
|
||||
| `--port` | The port number for the server. | 30000 |
|
||||
| `--skip-server-warmup` | If set, skip the server warmup process. | False |
|
||||
| `--warmups` | Warmup configurations. | None |
|
||||
| `--nccl-port` | The port for NCCL initialization. | None |
|
||||
|
||||
## Quantization and data type
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--dtype` | Data type for model weights and activations. 'auto' will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models. 'half' for FP16. Recommended for AWQ quantization. 'float16' is the same as 'half'. 'bfloat16' for a balance between precision and range. 'float' is shorthand for FP32 precision. 'float32' for FP32 precision. | auto |
|
||||
| `--quantization` | The quantization method. | None |
|
||||
| `--quantization-param-path` | Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. | None |
|
||||
| `--kv-cache-dtype` | Data type for kv cache storage. 'auto' will use model data type. 'fp8_e5m2' and 'fp8_e4m3' is supported for CUDA 11.8+. | auto |
|
||||
|
||||
## Memory and scheduling
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--mem-fraction-static` | The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors. | None |
|
||||
| `--max-running-requests` | The maximum number of running requests. | None |
|
||||
| `--max-total-tokens` | The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes. | None |
|
||||
| `--chunked-prefill-size` | The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill. | None |
|
||||
| `--max-prefill-tokens` | The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length. | 16384 |
|
||||
| `--schedule-policy` | The scheduling policy of the requests. | fcfs |
|
||||
| `--schedule-conservativeness` | How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently. | 1.0 |
|
||||
| `--cpu-offload-gb` | How many GBs of RAM to reserve for CPU offloading. | 0 |
|
||||
| `--page-size` | The number of tokens in a page. | 1 |
|
||||
|
||||
## Runtime options
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--device` | The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified. | None |
|
||||
| `--tp-size` | The tensor parallelism size. | 1 |
|
||||
| `--pp-size` | The pipeline parallelism size. | 1 |
|
||||
| `--max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | None |
|
||||
| `--stream-interval` | The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher. | 1 |
|
||||
| `--stream-output` | Whether to output as a sequence of disjoint segments. | False |
|
||||
| `--random-seed` | The random seed. | None |
|
||||
| `--constrained-json-whitespace-pattern` | Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*. | None |
|
||||
| `--watchdog-timeout` | Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging. | 300 |
|
||||
| `--dist-timeout` | Set timeout for torch.distributed initialization. | None |
|
||||
| `--download-dir` | Model download directory for huggingface. | None |
|
||||
| `--base-gpu-id` | The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine. | 0 |
|
||||
| `--gpu-id-step` | The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,.... | 1 |
|
||||
| `--sleep-on-idle` | Reduce CPU usage when sglang is idle. | False |
|
||||
|
||||
## Logging
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|---------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|
|
||||
| `--log-level` | The logging level of all loggers. | info |
|
||||
| `--log-level-http` | The logging level of HTTP server. If not set, reuse --log-level by default. | None |
|
||||
| `--log-requests` | Log metadata, inputs, outputs of all requests. The verbosity is decided by --log-requests-level. | False |
|
||||
| `--log-requests-level` | 0: Log metadata (no sampling parameters). 1: Log metadata and sampling parameters. 2: Log metadata, sampling parameters and partial input/output. 3: Log every input/output. | 0 |
|
||||
| `--show-time-cost` | Show time cost of custom marks. | False |
|
||||
| `--enable-metrics` | Enable log prometheus metrics. | False |
|
||||
| `--bucket-time-to-first-token` | The buckets of time to first token, specified as a list of floats. | None |
|
||||
| `--bucket-inter-token-latency` | The buckets of inter-token latency, specified as a list of floats. | None |
|
||||
| `--bucket-e2e-request-latency` | The buckets of end-to-end request latency, specified as a list of floats. | None |
|
||||
| `--collect-tokens-histogram` | Collect prompt/generation tokens histogram. | False |
|
||||
| `--kv-events-config` | Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used. | None |
|
||||
| `--decode-log-interval` | The log interval of decode batch. | 40 |
|
||||
| `--enable-request-time-stats-logging` | Enable per request time stats logging. | False |
|
||||
| `--prompt-tokens-buckets` | The buckets rule of prompt tokens. Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' generates two-sided exponentially distributed buckets (e.g., 'tse 1000 2 8' generates buckets [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]); 'customer <value1> <value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500'). | None |
|
||||
| `--generation-tokens-buckets` | The buckets rule of generation tokens. Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' generates two-sided exponentially distributed buckets (e.g., 'tse 1000 2 8' generates buckets [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]); 'customer <value1> <value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500'). | None |
|
||||
|
||||
## API related
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--api-key` | Set API key of the server. It is also used in the OpenAI API compatible server. | None |
|
||||
| `--served-model-name` | Override the model name returned by the v1/models endpoint in OpenAI API server. | None |
|
||||
| `--chat-template` | The built-in chat template name or the path of the chat template file. This is only used for the OpenAI-compatible API server. | None |
|
||||
| `--completion-template` | The built-in completion template name or the path of the completion template file. This is only used for the OpenAI-compatible API server, and currently only for code completion. | None |
|
||||
| `--file-storage-path` | The path of the file storage in backend. | sglang_storage |
|
||||
| `--enable-cache-report` | Return number of cached tokens in usage.prompt_tokens_details for each openai request. | False |
|
||||
| `--reasoning-parser` | Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}. | None |
|
||||
| `--tool-call-parser` | Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'. | None |
|
||||
|
||||
## Data parallelism
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--dp-size` | The data parallelism size. | 1 |
|
||||
| `--load-balance-method` | The load balancing strategy for data parallelism. Options include: 'round_robin', 'minimum_tokens'. The Minimum Token algorithm can only be used when DP attention is applied. This algorithm performs load balancing based on the real-time token load of the DP workers. | round_robin |
|
||||
|
||||
## Multi-node distributed serving
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--dist-init-addr` | The host address for initializing distributed backend (e.g., `192.168.0.2:25000`). | None |
|
||||
| `--nnodes` | The number of nodes. | 1 |
|
||||
| `--node-rank` | The node rank. | 0 |
|
||||
|
||||
## Model override args in JSON
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--json-model-override-args` | A dictionary in JSON string format used to override default model configurations. | {} |
|
||||
| `--preferred-sampling-params` | json-formatted sampling settings that will be returned in /get_model_info. | None |
|
||||
|
||||
## LoRA
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--enable-lora` | Enable LoRA support for the model. This argument is automatically set to True if `--lora-paths` is provided for backward compatibility. | False |
|
||||
| `--max-lora-rank` | The maximum LoRA rank that should be supported. If not specified, it will be automatically inferred from the adapters provided in `--lora-paths`. This argument is needed when you expect to dynamically load adapters of larger LoRA rank after server startup. | None |
|
||||
| `--lora-target-modules` | The union set of all target modules where LoRA should be applied (e.g., `q_proj`, `k_proj`, `gate_proj`). If not specified, it will be automatically inferred from the adapters provided in `--lora-paths`. This argument is needed when you expect to dynamically load adapters of different target modules after server startup. You can also set it to `all` to enable LoRA for all supported modules. However, enabling LoRA on additional modules introduces a minor performance overhead. If your application is performance-sensitive, we recommend only specifying the modules for which you plan to load adapters. | None |
|
||||
| `--lora-paths` | The list of LoRA adapters to load. Each adapter must be specified in one of the following formats: <PATH> | <NAME>=<PATH> | JSON with schema {"lora_name":str,"lora_path":str,"pinned":bool} | None |
|
||||
| `--max-loras-per-batch` | Maximum number of adapters for a running batch, include base-only request. | 8 |
|
||||
| `--max-loaded-loras` | If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`. | None |
|
||||
| `--lora-backend` | Choose the kernel backend for multi-LoRA serving. | triton |
|
||||
|
||||
## Kernel backend
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--attention-backend` | Choose the kernels for attention layers. | None |
|
||||
| `--prefill-attention-backend` | (Experimental) This argument specifies the backend for prefill attention computation. Note that this argument has priority over `attention_backend`. | None |
|
||||
| `--decode-attention-backend` | (Experimental) This argument specifies the backend for decode attention computation. Note that this argument has priority over `attention_backend`. | None |
|
||||
| `--sampling-backend` | Choose the kernels for sampling layers. | None |
|
||||
| `--grammar-backend` | Choose the backend for grammar-guided decoding. | None |
|
||||
| `--mm-attention-backend` | Set multimodal attention backend. | None |
|
||||
|
||||
## Speculative decoding
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--speculative-algorithm` | Speculative algorithm. | None |
|
||||
| `--speculative-draft-model-path` | The path of the draft model weights. This can be a local folder or a Hugging Face repo ID. | None |
|
||||
| `--speculative-num-steps` | The number of steps sampled from draft model in Speculative Decoding. | None |
|
||||
| `--speculative-eagle-topk` | The number of tokens sampled from the draft model in eagle2 each step. | None |
|
||||
| `--speculative-num-draft-tokens` | The number of tokens sampled from the draft model in Speculative Decoding. | None |
|
||||
| `--speculative-accept-threshold-single` | Accept a draft token if its probability in the target model is greater than this threshold. | 1.0 |
|
||||
| `--speculative-accept-threshold-acc` | The accept probability of a draft token is raised from its target probability p to min(1, p / threshold_acc). | 1.0 |
|
||||
| `--speculative-token-map` | The path of the draft model's small vocab table. | None |
|
||||
| `--speculative-attention-mode` | Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'. | prefill |
|
||||
|
||||
## Expert parallelism
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--ep-size` | The expert parallelism size. | 1 |
|
||||
| `--moe-a2a-backend` | Select the backend for all-to-all communication for expert parallelism. | none |
|
||||
| `--moe-runner-backend` | Select the runner backend for MoE. | 'triton' |
|
||||
| `--deepep-mode` | Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch. | auto |
|
||||
| `--ep-num-redundant-experts` | Allocate this number of redundant experts in expert parallel. | 0 |
|
||||
| `--ep-dispatch-algorithm` | The algorithm to choose ranks for redundant experts in EPLB. | None |
|
||||
| `--init-expert-location` | Initial location of EP experts. | trivial |
|
||||
| `--enable-eplb` | Enable EPLB algorithm. | False |
|
||||
| `--eplb-algorithm` | Chosen EPLB algorithm. | auto |
|
||||
| `--eplb-rebalance-num-iterations` | Number of iterations to automatically trigger a EPLB re-balance. | 1000 |
|
||||
| `--eplb-rebalance-layers-per-chunk` | Number of layers to rebalance per forward pass. | None |
|
||||
| `--expert-distribution-recorder-mode` | Mode of expert distribution recorder. | None |
|
||||
| `--expert-distribution-recorder-buffer-size` | Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer. | None |
|
||||
| `--enable-expert-distribution-metrics` | Enable logging metrics for expert balancedness. | False |
|
||||
| `--deepep-config` | Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path. | None |
|
||||
| `--moe-dense-tp-size` | TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports. | None |
|
||||
|
||||
## Hierarchical cache
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--enable-hierarchical-cache` | Enable hierarchical cache. | False |
|
||||
| `--hicache-ratio` | The ratio of the size of host KV cache memory pool to the size of device pool. | 2.0 |
|
||||
| `--hicache-size` | The size of the hierarchical cache. | 0 |
|
||||
| `--hicache-write-policy` | The write policy for hierarchical cache. | write_through |
|
||||
| `--hicache-io-backend` | The IO backend for hierarchical cache. | |
|
||||
| `--hicache-storage-backend` | The storage backend for hierarchical cache. | None |
|
||||
|
||||
## Optimization/debug options
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--disable-radix-cache` | Disable RadixAttention for prefix caching. | False |
|
||||
| `--cuda-graph-max-bs` | Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value. | None |
|
||||
| `--cuda-graph-bs` | Set the list of batch sizes for cuda graph. | None |
|
||||
| `--disable-cuda-graph` | Disable cuda graph. | False |
|
||||
| `--disable-cuda-graph-padding` | Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed. | False |
|
||||
| `--enable-profile-cuda-graph` | Enable profiling of cuda graph capture. | False |
|
||||
| `--enable-nccl-nvls` | Enable NCCL NVLS for prefill heavy requests when available. | False |
|
||||
| `--enable-symm-mem` | Enable NCCL symmetric memory for fast collectives. | False |
|
||||
| `--enable-tokenizer-batch-encode` | Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds. | False |
|
||||
| `--disable-outlines-disk-cache` | Disable disk cache of outlines to avoid possible crashes related to file system or high concurrency. | False |
|
||||
| `--disable-custom-all-reduce` | Disable the custom all-reduce kernel and fall back to NCCL. | False |
|
||||
| `--enable-mscclpp` | Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL. | False |
|
||||
| `--disable-overlap-schedule` | Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker. | False |
|
||||
| `--enable-mixed-chunk` | Enabling mixing prefill and decode in a batch when using chunked prefill. | False |
|
||||
| `--enable-dp-attention` | Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently DeepSeek-V2 and Qwen 2/3 MoE models are supported. | False |
|
||||
| `--enable-dp-lm-head` | Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention. | False |
|
||||
| `--enable-two-batch-overlap` | Enabling two micro batches to overlap. | False |
|
||||
| `--tbo-token-distribution-threshold` | The threshold of token distribution between two batches in micro-batch-overlap, determines whether to two-batch-overlap or two-chunk-overlap. Set to 0 denote disable two-chunk-overlap. | 0.48 |
|
||||
| `--enable-torch-compile` | Optimize the model with torch.compile. Experimental feature. | False |
|
||||
| `--torch-compile-max-bs` | Set the maximum batch size when using torch compile. | 32 |
|
||||
| `--torchao-config` | Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo, fp8dq-per_tensor, fp8dq-per_row. | |
|
||||
| `--enable-nan-detection` | Enable the NaN detection for debugging purposes. | False |
|
||||
| `--enable-p2p-check` | Enable P2P check for GPU access, otherwise the p2p access is allowed by default. | False |
|
||||
| `--triton-attention-reduce-in-fp32` | Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16. This only affects Triton attention kernels. | False |
|
||||
| `--triton-attention-num-kv-splits` | The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8. | 8 |
|
||||
| `--num-continuous-decode-steps` | Run multiple continuous decoding steps to reduce scheduling overhead. This can potentially increase throughput but may also increase time-to-first-token latency. The default value is 1, meaning only run one decoding step at a time. | 1 |
|
||||
| `--delete-ckpt-after-loading` | Delete the model checkpoint after loading the model. | False |
|
||||
| `--enable-memory-saver` | Allow saving memory using release_memory_occupation and resume_memory_occupation. | False |
|
||||
| `--allow-auto-truncate` | Allow automatically truncating requests that exceed the maximum input length instead of returning an error. | False |
|
||||
| `--enable-custom-logit-processor` | Enable users to pass custom logit processors to the server (disabled by default for security). | False |
|
||||
| `--flashinfer-mla-disable-ragged` | Disable ragged processing in Flashinfer MLA. | False |
|
||||
| `--disable-shared-experts-fusion` | Disable shared experts fusion. | False |
|
||||
| `--disable-chunked-prefix-cache` | Disable chunked prefix cache. | False |
|
||||
| `--disable-fast-image-processor` | Disable fast image processor. | False |
|
||||
| `--enable-return-hidden-states` | Enable returning hidden states. | False |
|
||||
|
||||
## Debug tensor dumps
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--debug-tensor-dump-output-folder` | The output folder for debug tensor dumps. | None |
|
||||
| `--debug-tensor-dump-input-file` | The input file for debug tensor dumps. | None |
|
||||
| `--debug-tensor-dump-inject` | Enable injection of debug tensor dumps. | False |
|
||||
| `--debug-tensor-dump-prefill-only` | Enable prefill-only mode for debug tensor dumps. | False |
|
||||
|
||||
## PD disaggregation
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--disaggregation-mode` | PD disaggregation mode: "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only). | null |
|
||||
| `--disaggregation-transfer-backend` | The transfer backend for PD disaggregation. | mooncake |
|
||||
| `--disaggregation-bootstrap-port` | The bootstrap port for PD disaggregation. | 8998 |
|
||||
| `--disaggregation-decode-tp` | The decode TP for PD disaggregation. | None |
|
||||
| `--disaggregation-decode-dp` | The decode DP for PD disaggregation. | None |
|
||||
| `--disaggregation-prefill-pp` | The prefill PP for PD disaggregation. | 1 |
|
||||
|
||||
## Model weight update
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--custom-weight-loader` | Custom weight loader paths. | None |
|
||||
| `--weight-loader-disable-mmap` | Disable mmap for weight loader. | False |
|
||||
|
||||
## PD-Multiplexing
|
||||
|
||||
| Arguments | Description | Defaults |
|
||||
|-----------|-------------|----------|
|
||||
| `--enable-pdmux` | Enable PD-Multiplexing. | False |
|
||||
| `--sm-group-num` | Number of SM groups for PD-Multiplexing. | 3 |
|
||||
370
docs/advanced_features/speculative_decoding.ipynb
Normal file
370
docs/advanced_features/speculative_decoding.ipynb
Normal file
@@ -0,0 +1,370 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Speculative Decoding\n",
|
||||
"\n",
|
||||
"SGLang now provides an EAGLE-based (EAGLE-2/EAGLE-3) speculative decoding option. Our implementation aims to maximize speed and efficiency and is considered to be among the fastest in open-source LLM engines.\n",
|
||||
"\n",
|
||||
"### Performance Highlights\n",
|
||||
"\n",
|
||||
"Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be achieved via EAGLE3 decoding.\n",
|
||||
"For further details please see the [EAGLE3 paper](https://arxiv.org/pdf/2503.01840).\n",
|
||||
"\n",
|
||||
"| Method | Throughput (tokens/s) |\n",
|
||||
"|--------|----------------|\n",
|
||||
"| SGLang (w/o speculative, 1x H100) | 158.34 tokens/s |\n",
|
||||
"| SGLang + EAGLE-2 (1x H100) | 244.10 tokens/s |\n",
|
||||
"| SGLang + EAGLE-3 (1x H100) | 373.25 tokens/s |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## EAGLE Decoding\n",
|
||||
"\n",
|
||||
"To enable EAGLE speculative decoding the following parameters are relevant:\n",
|
||||
"* `speculative_draft_model_path`: Specifies draft model. This parameter is required.\n",
|
||||
"* `speculative_num_steps`: Depth of autoregressive drafting. Increases speculation range but risks rejection cascades. Default is 5.\n",
|
||||
"* `speculative_eagle_topk`: Branching factor per step. Improves candidate diversity, will lead to higher acceptance rate, but more lead to higher memory/compute consumption. Default is 4.\n",
|
||||
"* `speculative_num_draft_tokens`: Maximum parallel verification capacity. Allows deeper tree evaluation but will lead to higher GPU memory usage. Default is 8.\n",
|
||||
"\n",
|
||||
"These parameters are the same for EAGLE-2 and EAGLE-3.\n",
|
||||
"\n",
|
||||
"You can find the best combinations of these parameters with [bench_speculative.py](https://github.com/sgl-project/sglang/blob/main/scripts/playground/bench_speculative.py).\n",
|
||||
"\n",
|
||||
"In the documentation below, we set `--cuda-graph-max-bs` to be a small value for faster engine startup. For your own workloads, please tune the above parameters together with `--cuda-graph-max-bs`, `--max-running-requests`, `--mem-fraction-static` for the best performance. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EAGLE-2 decoding\n",
|
||||
"\n",
|
||||
"You can enable EAGLE-2 decoding by setting `--speculative-algorithm EAGLE` and choosing an appropriate model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
|
||||
"\n",
|
||||
"import openai"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n",
|
||||
" --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 3 \\\n",
|
||||
" --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8 --log-level warning\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"meta-llama/Llama-2-7b-chat-hf\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=64,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(f\"Response: {response}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EAGLE-2 Decoding with `torch.compile`\n",
|
||||
"\n",
|
||||
"You can also enable `torch.compile` for further optimizations and optionally set `--torch-compile-max-bs`:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n",
|
||||
" --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
|
||||
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n",
|
||||
" --enable-torch-compile --torch-compile-max-bs 2 --log-level warning\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"meta-llama/Llama-2-7b-chat-hf\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=64,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(f\"Response: {response}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EAGLE-2 Decoding via Frequency-Ranked Speculative Sampling\n",
|
||||
"\n",
|
||||
"By employing a truncated high-frequency token vocabulary in the draft model, Eagle speculative decoding reduces `lm_head` computational overhead while accelerating the pipeline without quality degradation. For more details, check out [the paper](https://arxiv.org/pdf/2502.14856).\n",
|
||||
"\n",
|
||||
"In our implementation, set `--speculative-token-map` to enable the optimization. You can get the high-frequency tokens used by FR-Spec from [this model](https://huggingface.co/thunlp/LLaMA3-Instruct-8B-FR-Spec). Alternatively, you can obtain the high-frequency tokens by directly downloading them from [this repo](https://github.com/thunlp/FR-Spec/tree/main?tab=readme-ov-file#prepare-fr-spec-vocabulary-subset).\n",
|
||||
"\n",
|
||||
"Thanks for the contribution from [Weilin Zhao](https://github.com/Achazwl) and [Zhousx](https://github.com/Zhou-sx). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
"python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algorithm EAGLE \\\n",
|
||||
" --speculative-draft-model-path lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \\\n",
|
||||
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --speculative-token-map thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt \\\n",
|
||||
" --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=64,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(f\"Response: {response}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EAGLE-3 Decoding\n",
|
||||
"\n",
|
||||
"You can enable EAGLE-3 decoding by setting `--speculative-algorithm EAGLE3` and choosing an appropriate model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
"python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct --speculative-algorithm EAGLE3 \\\n",
|
||||
" --speculative-draft-model-path jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B --speculative-num-steps 5 \\\n",
|
||||
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 32 --mem-fraction 0.6 \\\n",
|
||||
" --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
"    model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=64,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(f\"Response: {response}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Multi Token Prediction\n",
|
||||
"\n",
|
||||
"We support [MTP (Multi-Token Prediction)](https://arxiv.org/pdf/2404.19737) in SGLang via speculative decoding. We use the XiaomiMiMo/MiMo-7B-RL model as an example here (for DeepSeek MTP usage, refer to the [DeepSeek docs](../basic_usage/deepseek.md#multi-token-prediction))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
" python3 -m sglang.launch_server --model-path XiaomiMiMo/MiMo-7B-RL --host 0.0.0.0 --trust-remote-code \\\n",
|
||||
" --speculative-algorithm EAGLE --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \\\n",
|
||||
" --mem-fraction 0.5 --log-level warning\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"url = f\"http://localhost:{port}/v1/chat/completions\"\n",
|
||||
"\n",
|
||||
"data = {\n",
|
||||
" \"model\": \"XiaomiMiMo/MiMo-7B-RL\",\n",
|
||||
" \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"response = requests.post(url, json=data)\n",
|
||||
"print_highlight(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"\n",
|
||||
"EAGLE process is as follows:\n",
|
||||
"\n",
|
||||
"- Within EAGLE the draft model predicts the next feature vector, i.e. the last hidden state of the original LLM, using the feature sequence $(f_1, ..., f_k)$ and the token sequence $(t_2, ..., t_{k+1})$. \n",
|
||||
"- The next token is then sampled from $p_{k+2}=\\text{LMHead}(f_{k+1})$. Afterwards, the two sequences are extended in a tree style—branching out multiple potential continuations, with the branching factor per step controlled by the `speculative_eagle_topk` parameter—to ensure a more coherent connection of context, and are given as input again.\n",
|
||||
"- EAGLE-2 additionally uses the draft model to evaluate how probable certain branches in the draft tree are, dynamically stopping the expansion of unlikely branches. After the expansion phase, reranking is employed to select only the top `speculative_num_draft_tokens` final nodes as draft tokens.\n",
|
||||
"- EAGLE-3 removes the feature prediction objective, incorporates low and mid-layer features, and is trained in an on-policy manner.\n",
|
||||
"\n",
|
||||
"This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionally to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"For guidance how to train your own EAGLE model please see the [EAGLE repo](https://github.com/SafeAILab/EAGLE/tree/main?tab=readme-ov-file#train)."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
853
docs/advanced_features/structured_outputs.ipynb
Normal file
853
docs/advanced_features/structured_outputs.ipynb
Normal file
@@ -0,0 +1,853 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Structured Outputs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can specify a JSON schema, [regular expression](https://en.wikipedia.org/wiki/Regular_expression) or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request.\n",
|
||||
"\n",
|
||||
"SGLang supports three grammar backends:\n",
|
||||
"\n",
|
||||
"- [XGrammar](https://github.com/mlc-ai/xgrammar)(default): Supports JSON schema, regular expression, and EBNF constraints.\n",
|
||||
"- [Outlines](https://github.com/dottxt-ai/outlines): Supports JSON schema and regular expression constraints.\n",
|
||||
"- [Llguidance](https://github.com/guidance-ai/llguidance): Supports JSON schema, regular expression, and EBNF constraints.\n",
|
||||
"\n",
|
||||
"We suggest using XGrammar for its better performance and utility. XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md). For more details, see [XGrammar technical overview](https://blog.mlc.ai/2024/11/22/achieving-efficient-flexible-portable-structured-generation-with-xgrammar).\n",
|
||||
"\n",
|
||||
"To use Outlines, simply add `--grammar-backend outlines` when launching the server.\n",
|
||||
"To use llguidance, add `--grammar-backend llguidance` when launching the server.\n",
|
||||
"If no backend is specified, XGrammar will be used as the default.\n",
|
||||
"\n",
|
||||
"For better output quality, **it's advisable to explicitly include instructions in the prompt to guide the model to generate the desired format.** For example, you can specify, 'Please generate the output in the following JSON format: ...'.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## OpenAI Compatible API"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import openai\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
|
||||
"\n",
|
||||
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --log-level warning\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")\n",
|
||||
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### JSON\n",
|
||||
"\n",
|
||||
"You can directly define a JSON schema or use [Pydantic](https://docs.pydantic.dev/latest/) to define and validate the response."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Using Pydantic**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Define the schema using Pydantic\n",
|
||||
"class CapitalInfo(BaseModel):\n",
|
||||
" name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
|
||||
" population: int = Field(..., description=\"Population of the capital city\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"Please generate the information of the capital of France in the JSON format.\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=128,\n",
|
||||
" response_format={\n",
|
||||
" \"type\": \"json_schema\",\n",
|
||||
" \"json_schema\": {\n",
|
||||
" \"name\": \"foo\",\n",
|
||||
" # convert the pydantic model to json schema\n",
|
||||
" \"schema\": CapitalInfo.model_json_schema(),\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"response_content = response.choices[0].message.content\n",
|
||||
"# validate the JSON response by the pydantic model\n",
|
||||
"capital_info = CapitalInfo.model_validate_json(response_content)\n",
|
||||
"print_highlight(f\"Validated response: {capital_info.model_dump_json()}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**JSON Schema Directly**\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"\n",
|
||||
"json_schema = json.dumps(\n",
|
||||
" {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
|
||||
" \"population\": {\"type\": \"integer\"},\n",
|
||||
" },\n",
|
||||
" \"required\": [\"name\", \"population\"],\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"Give me the information of the capital of France in the JSON format.\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=128,\n",
|
||||
" response_format={\n",
|
||||
" \"type\": \"json_schema\",\n",
|
||||
" \"json_schema\": {\"name\": \"foo\", \"schema\": json.loads(json_schema)},\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EBNF"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ebnf_grammar = \"\"\"\n",
|
||||
"root ::= city | description\n",
|
||||
"city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\n",
|
||||
"description ::= city \" is \" status\n",
|
||||
"status ::= \"the capital of \" country\n",
|
||||
"country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": \"You are a helpful geography bot.\"},\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"Give me the information of the capital of France.\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=32,\n",
|
||||
" extra_body={\"ebnf\": ebnf_grammar},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Regular expression"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=128,\n",
|
||||
" extra_body={\"regex\": \"(Paris|London)\"},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Structural Tag"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tool_get_current_weather = {\n",
|
||||
" \"type\": \"function\",\n",
|
||||
" \"function\": {\n",
|
||||
" \"name\": \"get_current_weather\",\n",
|
||||
" \"description\": \"Get the current weather in a given location\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"city\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
|
||||
" },\n",
|
||||
" \"state\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"the two-letter abbreviation for the state that the city is\"\n",
|
||||
" \" in, e.g. 'CA' which would mean 'California'\",\n",
|
||||
" },\n",
|
||||
" \"unit\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The unit to fetch the temperature in\",\n",
|
||||
" \"enum\": [\"celsius\", \"fahrenheit\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" \"required\": [\"city\", \"state\", \"unit\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"tool_get_current_date = {\n",
|
||||
" \"type\": \"function\",\n",
|
||||
" \"function\": {\n",
|
||||
" \"name\": \"get_current_date\",\n",
|
||||
" \"description\": \"Get the current date and time for a given timezone\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"timezone\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The timezone to fetch the current date and time for, e.g. 'America/New_York'\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"required\": [\"timezone\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"schema_get_current_weather = tool_get_current_weather[\"function\"][\"parameters\"]\n",
|
||||
"schema_get_current_date = tool_get_current_date[\"function\"][\"parameters\"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_messages():\n",
|
||||
" return [\n",
|
||||
" {\n",
|
||||
" \"role\": \"system\",\n",
|
||||
" \"content\": f\"\"\"\n",
|
||||
"# Tool Instructions\n",
|
||||
"- Always execute python code in messages that you share.\n",
|
||||
"- When looking for real time information use relevant functions if available else fallback to brave_search\n",
|
||||
"You have access to the following functions:\n",
|
||||
"Use the function 'get_current_weather' to: Get the current weather in a given location\n",
|
||||
"{tool_get_current_weather[\"function\"]}\n",
|
||||
"Use the function 'get_current_date' to: Get the current date and time for a given timezone\n",
|
||||
"{tool_get_current_date[\"function\"]}\n",
|
||||
"If a you choose to call a function ONLY reply in the following format:\n",
|
||||
"<{{start_tag}}={{function_name}}>{{parameters}}{{end_tag}}\n",
|
||||
"where\n",
|
||||
"start_tag => `<function`\n",
|
||||
"parameters => a JSON dict with the function argument name as key and function argument value as value.\n",
|
||||
"end_tag => `</function>`\n",
|
||||
"Here is an example,\n",
|
||||
"<function=example_function_name>{{\"example_name\": \"example_value\"}}</function>\n",
|
||||
"Reminder:\n",
|
||||
"- Function calls MUST follow the specified format\n",
|
||||
"- Required parameters MUST be specified\n",
|
||||
"- Only call one function at a time\n",
|
||||
"- Put the entire function call reply on one line\n",
|
||||
"- Always add your sources when using search results to answer the user query\n",
|
||||
"You are a helpful assistant.\"\"\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"You are in New York. Please get the current date and time, and the weather.\",\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"messages = get_messages()\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" messages=messages,\n",
|
||||
" response_format={\n",
|
||||
" \"type\": \"structural_tag\",\n",
|
||||
" \"structures\": [\n",
|
||||
" {\n",
|
||||
" \"begin\": \"<function=get_current_weather>\",\n",
|
||||
" \"schema\": schema_get_current_weather,\n",
|
||||
" \"end\": \"</function>\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"begin\": \"<function=get_current_date>\",\n",
|
||||
" \"schema\": schema_get_current_date,\n",
|
||||
" \"end\": \"</function>\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" \"triggers\": [\"<function=\"],\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Native API and SGLang Runtime (SRT)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### JSON"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Using Pydantic**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import json\n",
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
"\n",
|
||||
"from transformers import AutoTokenizer\n",
|
||||
"\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Define the schema using Pydantic\n",
|
||||
"class CapitalInfo(BaseModel):\n",
|
||||
" name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
|
||||
" population: int = Field(..., description=\"Population of the capital city\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Make API request\n",
|
||||
"messages = [\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"Here is the information of the capital of France in the JSON format.\\n\",\n",
|
||||
" }\n",
|
||||
"]\n",
|
||||
"text = tokenizer.apply_chat_template(\n",
|
||||
" messages, tokenize=False, add_generation_prompt=True\n",
|
||||
")\n",
|
||||
"response = requests.post(\n",
|
||||
" f\"http://localhost:{port}/generate\",\n",
|
||||
" json={\n",
|
||||
" \"text\": text,\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"max_new_tokens\": 64,\n",
|
||||
" \"json_schema\": json.dumps(CapitalInfo.model_json_schema()),\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"print_highlight(response.json())\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"response_data = json.loads(response.json()[\"text\"])\n",
|
||||
"# validate the response by the pydantic model\n",
|
||||
"capital_info = CapitalInfo.model_validate(response_data)\n",
|
||||
"print_highlight(f\"Validated response: {capital_info.model_dump_json()}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**JSON Schema Directly**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"json_schema = json.dumps(\n",
|
||||
" {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
|
||||
" \"population\": {\"type\": \"integer\"},\n",
|
||||
" },\n",
|
||||
" \"required\": [\"name\", \"population\"],\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# JSON\n",
|
||||
"response = requests.post(\n",
|
||||
" f\"http://localhost:{port}/generate\",\n",
|
||||
" json={\n",
|
||||
" \"text\": text,\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"max_new_tokens\": 64,\n",
|
||||
" \"json_schema\": json_schema,\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EBNF"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"messages = [\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"Give me the information of the capital of France.\",\n",
|
||||
" }\n",
|
||||
"]\n",
|
||||
"text = tokenizer.apply_chat_template(\n",
|
||||
" messages, tokenize=False, add_generation_prompt=True\n",
|
||||
")\n",
|
||||
"response = requests.post(\n",
|
||||
" f\"http://localhost:{port}/generate\",\n",
|
||||
" json={\n",
|
||||
" \"text\": text,\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"max_new_tokens\": 128,\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"n\": 3,\n",
|
||||
" \"ebnf\": (\n",
|
||||
" \"root ::= city | description\\n\"\n",
|
||||
" 'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n",
|
||||
" 'description ::= city \" is \" status\\n'\n",
|
||||
" 'status ::= \"the capital of \" country\\n'\n",
|
||||
" 'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n",
|
||||
" ),\n",
|
||||
" },\n",
|
||||
" \"stream\": False,\n",
|
||||
" \"return_logprob\": False,\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Regular expression"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"messages = [\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"Paris is the capital of\",\n",
|
||||
" }\n",
|
||||
"]\n",
|
||||
"text = tokenizer.apply_chat_template(\n",
|
||||
" messages, tokenize=False, add_generation_prompt=True\n",
|
||||
")\n",
|
||||
"response = requests.post(\n",
|
||||
" f\"http://localhost:{port}/generate\",\n",
|
||||
" json={\n",
|
||||
" \"text\": text,\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"max_new_tokens\": 64,\n",
|
||||
" \"regex\": \"(France|England)\",\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"print_highlight(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Structural Tag"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import AutoTokenizer\n",
|
||||
"\n",
|
||||
"# generate an answer\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
|
||||
"\n",
|
||||
"text = tokenizer.apply_chat_template(\n",
|
||||
" messages, tokenize=False, add_generation_prompt=True\n",
|
||||
")\n",
|
||||
"payload = {\n",
|
||||
" \"text\": text,\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"structural_tag\": json.dumps(\n",
|
||||
" {\n",
|
||||
" \"type\": \"structural_tag\",\n",
|
||||
" \"structures\": [\n",
|
||||
" {\n",
|
||||
" \"begin\": \"<function=get_current_weather>\",\n",
|
||||
" \"schema\": schema_get_current_weather,\n",
|
||||
" \"end\": \"</function>\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"begin\": \"<function=get_current_date>\",\n",
|
||||
" \"schema\": schema_get_current_date,\n",
|
||||
" \"end\": \"</function>\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" \"triggers\": [\"<function=\"],\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Send POST request to the API endpoint\n",
|
||||
"response = requests.post(f\"http://localhost:{port}/generate\", json=payload)\n",
|
||||
"print_highlight(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Offline Engine API"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sglang as sgl\n",
|
||||
"\n",
|
||||
"llm = sgl.Engine(\n",
|
||||
" model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\", grammar_backend=\"xgrammar\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### JSON"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Using Pydantic**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"prompts = [\n",
|
||||
" \"Give me the information of the capital of China in the JSON format.\",\n",
|
||||
" \"Give me the information of the capital of France in the JSON format.\",\n",
|
||||
" \"Give me the information of the capital of Ireland in the JSON format.\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Define the schema using Pydantic\n",
|
||||
"class CapitalInfo(BaseModel):\n",
|
||||
" name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
|
||||
" population: int = Field(..., description=\"Population of the capital city\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"sampling_params = {\n",
|
||||
" \"temperature\": 0.1,\n",
|
||||
" \"top_p\": 0.95,\n",
|
||||
" \"json_schema\": json.dumps(CapitalInfo.model_json_schema()),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"outputs = llm.generate(prompts, sampling_params)\n",
|
||||
"for prompt, output in zip(prompts, outputs):\n",
|
||||
" print_highlight(\"===============================\")\n",
|
||||
" print_highlight(f\"Prompt: {prompt}\") # validate the output by the pydantic model\n",
|
||||
" capital_info = CapitalInfo.model_validate_json(output[\"text\"])\n",
|
||||
" print_highlight(f\"Validated output: {capital_info.model_dump_json()}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**JSON Schema Directly**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Give me the information of the capital of China in the JSON format.\",\n",
|
||||
" \"Give me the information of the capital of France in the JSON format.\",\n",
|
||||
" \"Give me the information of the capital of Ireland in the JSON format.\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"json_schema = json.dumps(\n",
|
||||
" {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
|
||||
" \"population\": {\"type\": \"integer\"},\n",
|
||||
" },\n",
|
||||
" \"required\": [\"name\", \"population\"],\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"sampling_params = {\"temperature\": 0.1, \"top_p\": 0.95, \"json_schema\": json_schema}\n",
|
||||
"\n",
|
||||
"outputs = llm.generate(prompts, sampling_params)\n",
|
||||
"for prompt, output in zip(prompts, outputs):\n",
|
||||
" print_highlight(\"===============================\")\n",
|
||||
" print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EBNF\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Give me the information of the capital of France.\",\n",
|
||||
" \"Give me the information of the capital of Germany.\",\n",
|
||||
" \"Give me the information of the capital of Italy.\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"sampling_params = {\n",
|
||||
" \"temperature\": 0.8,\n",
|
||||
" \"top_p\": 0.95,\n",
|
||||
" \"ebnf\": (\n",
|
||||
" \"root ::= city | description\\n\"\n",
|
||||
" 'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n",
|
||||
" 'description ::= city \" is \" status\\n'\n",
|
||||
" 'status ::= \"the capital of \" country\\n'\n",
|
||||
" 'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n",
|
||||
" ),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"outputs = llm.generate(prompts, sampling_params)\n",
|
||||
"for prompt, output in zip(prompts, outputs):\n",
|
||||
" print_highlight(\"===============================\")\n",
|
||||
" print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Regular expression"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Please provide information about London as a major global city:\",\n",
|
||||
" \"Please provide information about Paris as a major global city:\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95, \"regex\": \"(France|England)\"}\n",
|
||||
"\n",
|
||||
"outputs = llm.generate(prompts, sampling_params)\n",
|
||||
"for prompt, output in zip(prompts, outputs):\n",
|
||||
" print_highlight(\"===============================\")\n",
|
||||
" print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Structural Tag"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = tokenizer.apply_chat_template(\n",
|
||||
" messages, tokenize=False, add_generation_prompt=True\n",
|
||||
")\n",
|
||||
"prompts = [text]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"sampling_params = {\n",
|
||||
" \"temperature\": 0.8,\n",
|
||||
" \"top_p\": 0.95,\n",
|
||||
" \"structural_tag\": json.dumps(\n",
|
||||
" {\n",
|
||||
" \"type\": \"structural_tag\",\n",
|
||||
" \"structures\": [\n",
|
||||
" {\n",
|
||||
" \"begin\": \"<function=get_current_weather>\",\n",
|
||||
" \"schema\": schema_get_current_weather,\n",
|
||||
" \"end\": \"</function>\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"begin\": \"<function=get_current_date>\",\n",
|
||||
" \"schema\": schema_get_current_date,\n",
|
||||
" \"end\": \"</function>\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" \"triggers\": [\"<function=\"],\n",
|
||||
" }\n",
|
||||
" ),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Send POST request to the API endpoint\n",
|
||||
"outputs = llm.generate(prompts, sampling_params)\n",
|
||||
"for prompt, output in zip(prompts, outputs):\n",
|
||||
" print_highlight(\"===============================\")\n",
|
||||
" print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm.shutdown()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,830 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Structured Outputs For Reasoning Models\n",
|
||||
"\n",
|
||||
"When working with reasoning models that use special tokens like `<think>...</think>` to denote reasoning sections, you might want to allow free-form text within these sections while still enforcing grammar constraints on the rest of the output.\n",
|
||||
"\n",
|
||||
"SGLang provides a feature to disable grammar restrictions within reasoning sections. This is particularly useful for models that need to perform complex reasoning steps before providing a structured output.\n",
|
||||
"\n",
|
||||
"To enable this feature, use the `--reasoning-parser` flag which decide the think_end_token, such as `</think>`, when launching the server. You can also specify the reasoning parser using the `--reasoning-parser` flag.\n",
|
||||
"\n",
|
||||
"## Supported Models\n",
|
||||
"\n",
|
||||
"Currently, SGLang supports the following reasoning models:\n",
|
||||
"- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `<think>` and `</think>` tags.\n",
|
||||
"- [QwQ](https://huggingface.co/Qwen/QwQ-32B): The reasoning content is wrapped with `<think>` and `</think>` tags.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Usage\n",
|
||||
"\n",
|
||||
"## OpenAI Compatible API"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Specify the `--grammar-backend`, `--reasoning-parser` option."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import openai\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
|
||||
"\n",
|
||||
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")\n",
|
||||
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### JSON\n",
|
||||
"\n",
|
||||
"you can directly define a JSON schema or use [Pydantic](https://docs.pydantic.dev/latest/) to define and validate the response."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Using Pydantic**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Define the schema using Pydantic\n",
|
||||
"class CapitalInfo(BaseModel):\n",
|
||||
" name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
|
||||
" population: int = Field(..., description=\"Population of the capital city\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"Give me the information and population of the capital of France in the JSON format.\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=2048,\n",
|
||||
" response_format={\n",
|
||||
" \"type\": \"json_schema\",\n",
|
||||
" \"json_schema\": {\n",
|
||||
" \"name\": \"foo\",\n",
|
||||
" # convert the pydantic model to json schema\n",
|
||||
" \"schema\": CapitalInfo.model_json_schema(),\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(\n",
|
||||
" f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**JSON Schema Directly**\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"\n",
|
||||
"json_schema = json.dumps(\n",
|
||||
" {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
|
||||
" \"population\": {\"type\": \"integer\"},\n",
|
||||
" },\n",
|
||||
" \"required\": [\"name\", \"population\"],\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"Give me the information and population of the capital of France in the JSON format.\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=2048,\n",
|
||||
" response_format={\n",
|
||||
" \"type\": \"json_schema\",\n",
|
||||
" \"json_schema\": {\"name\": \"foo\", \"schema\": json.loads(json_schema)},\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(\n",
|
||||
" f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EBNF"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ebnf_grammar = \"\"\"\n",
|
||||
"root ::= city | description\n",
|
||||
"city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\n",
|
||||
"description ::= city \" is \" status\n",
|
||||
"status ::= \"the capital of \" country\n",
|
||||
"country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": \"You are a helpful geography bot.\"},\n",
|
||||
" {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"Give me the information and population of the capital of France in the JSON format.\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=2048,\n",
|
||||
" extra_body={\"ebnf\": ebnf_grammar},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(\n",
|
||||
" f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Regular expression"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"assistant\", \"content\": \"What is the capital of France?\"},\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=2048,\n",
|
||||
" extra_body={\"regex\": \"(Paris|London)\"},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(\n",
|
||||
" f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Structural Tag"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tool_get_current_weather = {\n",
|
||||
" \"type\": \"function\",\n",
|
||||
" \"function\": {\n",
|
||||
" \"name\": \"get_current_weather\",\n",
|
||||
" \"description\": \"Get the current weather in a given location\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"city\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
|
||||
" },\n",
|
||||
" \"state\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"the two-letter abbreviation for the state that the city is\"\n",
|
||||
" \" in, e.g. 'CA' which would mean 'California'\",\n",
|
||||
" },\n",
|
||||
" \"unit\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The unit to fetch the temperature in\",\n",
|
||||
" \"enum\": [\"celsius\", \"fahrenheit\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" \"required\": [\"city\", \"state\", \"unit\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"tool_get_current_date = {\n",
|
||||
" \"type\": \"function\",\n",
|
||||
" \"function\": {\n",
|
||||
" \"name\": \"get_current_date\",\n",
|
||||
" \"description\": \"Get the current date and time for a given timezone\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"timezone\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The timezone to fetch the current date and time for, e.g. 'America/New_York'\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"required\": [\"timezone\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"schema_get_current_weather = tool_get_current_weather[\"function\"][\"parameters\"]\n",
|
||||
"schema_get_current_date = tool_get_current_date[\"function\"][\"parameters\"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_messages():\n",
|
||||
" return [\n",
|
||||
" {\n",
|
||||
" \"role\": \"system\",\n",
|
||||
" \"content\": f\"\"\"\n",
|
||||
"# Tool Instructions\n",
|
||||
"- Always execute python code in messages that you share.\n",
|
||||
"- When looking for real time information use relevant functions if available else fallback to brave_search\n",
|
||||
"You have access to the following functions:\n",
|
||||
"Use the function 'get_current_weather' to: Get the current weather in a given location\n",
|
||||
"{tool_get_current_weather[\"function\"]}\n",
|
||||
"Use the function 'get_current_date' to: Get the current date and time for a given timezone\n",
|
||||
"{tool_get_current_date[\"function\"]}\n",
|
||||
"If a you choose to call a function ONLY reply in the following format:\n",
|
||||
"<{{start_tag}}={{function_name}}>{{parameters}}{{end_tag}}\n",
|
||||
"where\n",
|
||||
"start_tag => `<function`\n",
|
||||
"parameters => a JSON dict with the function argument name as key and function argument value as value.\n",
|
||||
"end_tag => `</function>`\n",
|
||||
"Here is an example,\n",
|
||||
"<function=example_function_name>{{\"example_name\": \"example_value\"}}</function>\n",
|
||||
"Reminder:\n",
|
||||
"- Function calls MUST follow the specified format\n",
|
||||
"- Required parameters MUST be specified\n",
|
||||
"- Only call one function at a time\n",
|
||||
"- Put the entire function call reply on one line\n",
|
||||
"- Always add your sources when using search results to answer the user query\n",
|
||||
"You are a helpful assistant.\"\"\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"You are in New York. Please get the current date and time, and the weather.\",\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"messages = get_messages()\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n",
|
||||
" messages=messages,\n",
|
||||
" response_format={\n",
|
||||
" \"type\": \"structural_tag\",\n",
|
||||
" \"max_new_tokens\": 2048,\n",
|
||||
" \"structures\": [\n",
|
||||
" {\n",
|
||||
" \"begin\": \"<function=get_current_weather>\",\n",
|
||||
" \"schema\": schema_get_current_weather,\n",
|
||||
" \"end\": \"</function>\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"begin\": \"<function=get_current_date>\",\n",
|
||||
" \"schema\": schema_get_current_date,\n",
|
||||
" \"end\": \"</function>\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" \"triggers\": [\"<function=\"],\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(\n",
|
||||
" f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Native API and SGLang Runtime (SRT)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### JSON"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Using Pydantic**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
"from transformers import AutoTokenizer\n",
|
||||
"\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Define the schema using Pydantic\n",
|
||||
"class CapitalInfo(BaseModel):\n",
|
||||
" name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
|
||||
" population: int = Field(..., description=\"Population of the capital city\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"messages = [\n",
|
||||
" {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"Give me the information and population of the capital of France in the JSON format.\",\n",
|
||||
" },\n",
|
||||
"]\n",
|
||||
"text = tokenizer.apply_chat_template(\n",
|
||||
" messages, tokenize=False, add_generation_prompt=True\n",
|
||||
")\n",
|
||||
"# Make API request\n",
|
||||
"response = requests.post(\n",
|
||||
" f\"http://localhost:{port}/generate\",\n",
|
||||
" json={\n",
|
||||
" \"text\": text,\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"max_new_tokens\": 2048,\n",
|
||||
" \"json_schema\": json.dumps(CapitalInfo.model_json_schema()),\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"print(response.json())\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"reasoing_content = response.json()[\"text\"].split(\"</think>\")[0]\n",
|
||||
"content = response.json()[\"text\"].split(\"</think>\")[1]\n",
|
||||
"print_highlight(f\"reasoing_content: {reasoing_content}\\n\\ncontent: {content}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**JSON Schema Directly**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"json_schema = json.dumps(\n",
|
||||
" {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
|
||||
" \"population\": {\"type\": \"integer\"},\n",
|
||||
" },\n",
|
||||
" \"required\": [\"name\", \"population\"],\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# JSON\n",
|
||||
"text = tokenizer.apply_chat_template(\n",
|
||||
" messages, tokenize=False, add_generation_prompt=True\n",
|
||||
")\n",
|
||||
"response = requests.post(\n",
|
||||
" f\"http://localhost:{port}/generate\",\n",
|
||||
" json={\n",
|
||||
" \"text\": text,\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"max_new_tokens\": 2048,\n",
|
||||
" \"json_schema\": json_schema,\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EBNF"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = requests.post(\n",
|
||||
" f\"http://localhost:{port}/generate\",\n",
|
||||
" json={\n",
|
||||
" \"text\": \"Give me the information of the capital of France.\",\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"max_new_tokens\": 2048,\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"n\": 3,\n",
|
||||
" \"ebnf\": (\n",
|
||||
" \"root ::= city | description\\n\"\n",
|
||||
" 'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n",
|
||||
" 'description ::= city \" is \" status\\n'\n",
|
||||
" 'status ::= \"the capital of \" country\\n'\n",
|
||||
" 'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n",
|
||||
" ),\n",
|
||||
" },\n",
|
||||
" \"stream\": False,\n",
|
||||
" \"return_logprob\": False,\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Regular expression"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = requests.post(\n",
|
||||
" f\"http://localhost:{port}/generate\",\n",
|
||||
" json={\n",
|
||||
" \"text\": \"Paris is the capital of\",\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"max_new_tokens\": 2048,\n",
|
||||
" \"regex\": \"(France|England)\",\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"print(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Structural Tag"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = tokenizer.apply_chat_template(\n",
|
||||
" messages, tokenize=False, add_generation_prompt=True\n",
|
||||
")\n",
|
||||
"payload = {\n",
|
||||
" \"text\": text,\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"max_new_tokens\": 2048,\n",
|
||||
" \"structural_tag\": json.dumps(\n",
|
||||
" {\n",
|
||||
" \"type\": \"structural_tag\",\n",
|
||||
" \"structures\": [\n",
|
||||
" {\n",
|
||||
" \"begin\": \"<function=get_current_weather>\",\n",
|
||||
" \"schema\": schema_get_current_weather,\n",
|
||||
" \"end\": \"</function>\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"begin\": \"<function=get_current_date>\",\n",
|
||||
" \"schema\": schema_get_current_date,\n",
|
||||
" \"end\": \"</function>\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" \"triggers\": [\"<function=\"],\n",
|
||||
" }\n",
|
||||
" ),\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Send POST request to the API endpoint\n",
|
||||
"response = requests.post(f\"http://localhost:{port}/generate\", json=payload)\n",
|
||||
"print_highlight(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Offline Engine API"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sglang as sgl\n",
|
||||
"\n",
|
||||
"llm = sgl.Engine(\n",
|
||||
" model_path=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n",
|
||||
" reasoning_parser=\"deepseek-r1\",\n",
|
||||
" grammar_backend=\"xgrammar\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### JSON"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Using Pydantic**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"prompts = [\n",
|
||||
" \"Give me the information of the capital of China in the JSON format.\",\n",
|
||||
" \"Give me the information of the capital of France in the JSON format.\",\n",
|
||||
" \"Give me the information of the capital of Ireland in the JSON format.\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Define the schema using Pydantic\n",
|
||||
"class CapitalInfo(BaseModel):\n",
|
||||
" name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
|
||||
" population: int = Field(..., description=\"Population of the capital city\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"sampling_params = {\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"top_p\": 0.95,\n",
|
||||
" \"max_new_tokens\": 2048,\n",
|
||||
" \"json_schema\": json.dumps(CapitalInfo.model_json_schema()),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"outputs = llm.generate(prompts, sampling_params)\n",
|
||||
"for prompt, output in zip(prompts, outputs):\n",
|
||||
" print(\"===============================\")\n",
|
||||
" print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**JSON Schema Directly**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Give me the information of the capital of China in the JSON format.\",\n",
|
||||
" \"Give me the information of the capital of France in the JSON format.\",\n",
|
||||
" \"Give me the information of the capital of Ireland in the JSON format.\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"json_schema = json.dumps(\n",
|
||||
" {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
|
||||
" \"population\": {\"type\": \"integer\"},\n",
|
||||
" },\n",
|
||||
" \"required\": [\"name\", \"population\"],\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"sampling_params = {\"temperature\": 0, \"max_new_tokens\": 2048, \"json_schema\": json_schema}\n",
|
||||
"\n",
|
||||
"outputs = llm.generate(prompts, sampling_params)\n",
|
||||
"for prompt, output in zip(prompts, outputs):\n",
|
||||
" print(\"===============================\")\n",
|
||||
" print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EBNF\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Give me the information of the capital of France.\",\n",
|
||||
" \"Give me the information of the capital of Germany.\",\n",
|
||||
" \"Give me the information of the capital of Italy.\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"sampling_params = {\n",
|
||||
" \"temperature\": 0.8,\n",
|
||||
" \"top_p\": 0.95,\n",
|
||||
" \"ebnf\": (\n",
|
||||
" \"root ::= city | description\\n\"\n",
|
||||
" 'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n",
|
||||
" 'description ::= city \" is \" status\\n'\n",
|
||||
" 'status ::= \"the capital of \" country\\n'\n",
|
||||
" 'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n",
|
||||
" ),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"outputs = llm.generate(prompts, sampling_params)\n",
|
||||
"for prompt, output in zip(prompts, outputs):\n",
|
||||
" print(\"===============================\")\n",
|
||||
" print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Regular expression"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Please provide information about London as a major global city:\",\n",
|
||||
" \"Please provide information about Paris as a major global city:\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95, \"regex\": \"(France|England)\"}\n",
|
||||
"\n",
|
||||
"outputs = llm.generate(prompts, sampling_params)\n",
|
||||
"for prompt, output in zip(prompts, outputs):\n",
|
||||
" print(\"===============================\")\n",
|
||||
" print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = tokenizer.apply_chat_template(\n",
|
||||
" messages, tokenize=False, add_generation_prompt=True\n",
|
||||
")\n",
|
||||
"prompts = [text]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"sampling_params = {\n",
|
||||
" \"temperature\": 0.8,\n",
|
||||
" \"top_p\": 0.95,\n",
|
||||
" \"max_new_tokens\": 2048,\n",
|
||||
" \"structural_tag\": json.dumps(\n",
|
||||
" {\n",
|
||||
" \"type\": \"structural_tag\",\n",
|
||||
" \"structures\": [\n",
|
||||
" {\n",
|
||||
" \"begin\": \"<function=get_current_weather>\",\n",
|
||||
" \"schema\": schema_get_current_weather,\n",
|
||||
" \"end\": \"</function>\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"begin\": \"<function=get_current_date>\",\n",
|
||||
" \"schema\": schema_get_current_date,\n",
|
||||
" \"end\": \"</function>\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" \"triggers\": [\"<function=\"],\n",
|
||||
" }\n",
|
||||
" ),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Generate with the offline engine (no HTTP request needed)\n",
|
||||
"outputs = llm.generate(prompts, sampling_params)\n",
|
||||
"for prompt, output in zip(prompts, outputs):\n",
|
||||
" print(\"===============================\")\n",
|
||||
" print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm.shutdown()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
852
docs/advanced_features/tool_parser.ipynb
Normal file
852
docs/advanced_features/tool_parser.ipynb
Normal file
@@ -0,0 +1,852 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Tool Parser\n",
|
||||
"\n",
|
||||
"This guide demonstrates how to use SGLang’s [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Currently supported parsers:\n",
|
||||
"\n",
|
||||
"| Parser | Supported Models | Notes |\n",
|
||||
"|---|---|---|\n",
|
||||
"| `llama3` | Llama 3.1 / 3.2 / 3.3 (e.g. `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`, `meta-llama/Llama-3.3-70B-Instruct`) | |\n",
|
||||
"| `llama4` | Llama 4 (e.g. `meta-llama/Llama-4-Scout-17B-16E-Instruct`) | |\n",
|
||||
"| `mistral` | Mistral (e.g. `mistralai/Mistral-7B-Instruct-v0.3`, `mistralai/Mistral-Nemo-Instruct-2407`, `mistralai/Mistral-7B-v0.3`) | |\n",
|
||||
"| `qwen25` | Qwen 2.5 (e.g. `Qwen/Qwen2.5-1.5B-Instruct`, `Qwen/Qwen2.5-7B-Instruct`) and QwQ (i.e. `Qwen/QwQ-32B`) | For QwQ, reasoning parser can be enabled together with tool call parser. See [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html). |\n",
|
||||
"| `deepseekv3` | DeepSeek-v3 (e.g., `deepseek-ai/DeepSeek-V3-0324`) | |\n",
|
||||
"| `gpt-oss` | GPT-OSS (e.g., `openai/gpt-oss-120b`, `openai/gpt-oss-20b`, `lmsys/gpt-oss-120b-bf16`, `lmsys/gpt-oss-20b-bf16`) | The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel. To work around this, complete the tool round by returning tool results as `role=\"tool\"` messages, which enables the model to generate the final content. |\n",
|
||||
"| `kimi_k2` | `moonshotai/Kimi-K2-Instruct` | |\n",
|
||||
"| `pythonic` | Llama-3.2 / Llama-3.3 / Llama-4 | Model outputs function calls as Python code. Requires `--tool-call-parser pythonic` and is recommended to use with a specific chat template. |\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## OpenAI Compatible API"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Launching the Server"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\" # qwen25\n",
|
||||
")\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note that `--tool-call-parser` defines the parser used to interpret responses."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Define Tools for Function Call\n",
|
||||
"Below is a Python snippet that shows how to define a tool as a dictionary. The dictionary includes a tool name, a description, and parameters defined as properties."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define tools\n",
|
||||
"tools = [\n",
|
||||
" {\n",
|
||||
" \"type\": \"function\",\n",
|
||||
" \"function\": {\n",
|
||||
" \"name\": \"get_current_weather\",\n",
|
||||
" \"description\": \"Get the current weather in a given location\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"city\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
|
||||
" },\n",
|
||||
" \"state\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"the two-letter abbreviation for the state that the city is\"\n",
|
||||
" \" in, e.g. 'CA' which would mean 'California'\",\n",
|
||||
" },\n",
|
||||
" \"unit\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The unit to fetch the temperature in\",\n",
|
||||
" \"enum\": [\"celsius\", \"fahrenheit\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" \"required\": [\"city\", \"state\", \"unit\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Define Messages"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_messages():\n",
|
||||
" return [\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"What's the weather like in Boston today? Output a reasoning before act, then use the tools to help you.\",\n",
|
||||
" }\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"messages = get_messages()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Initialize the Client"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Initialize OpenAI-like client\n",
|
||||
"client = OpenAI(api_key=\"None\", base_url=f\"http://0.0.0.0:{port}/v1\")\n",
|
||||
"model_name = client.models.list().data[0].id"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Non-Streaming Request"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Non-streaming mode test\n",
|
||||
"response_non_stream = client.chat.completions.create(\n",
|
||||
" model=model_name,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0,\n",
|
||||
" top_p=0.95,\n",
|
||||
" max_tokens=1024,\n",
|
||||
" stream=False, # Non-streaming\n",
|
||||
" tools=tools,\n",
|
||||
")\n",
|
||||
"print_highlight(\"Non-stream response:\")\n",
|
||||
"print_highlight(response_non_stream)\n",
|
||||
"print_highlight(\"==== content ====\")\n",
|
||||
"print_highlight(response_non_stream.choices[0].message.content)\n",
|
||||
"print_highlight(\"==== tool_calls ====\")\n",
|
||||
"print_highlight(response_non_stream.choices[0].message.tool_calls)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Handle Tools\n",
|
||||
"When the engine determines it should call a particular tool, it will return arguments or partial arguments through the response. You can parse these arguments and later invoke the tool accordingly."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"name_non_stream = response_non_stream.choices[0].message.tool_calls[0].function.name\n",
|
||||
"arguments_non_stream = (\n",
|
||||
" response_non_stream.choices[0].message.tool_calls[0].function.arguments\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(f\"Final streamed function call name: {name_non_stream}\")\n",
|
||||
"print_highlight(f\"Final streamed function call arguments: {arguments_non_stream}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Streaming Request"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Streaming mode test\n",
|
||||
"print_highlight(\"Streaming response:\")\n",
|
||||
"response_stream = client.chat.completions.create(\n",
|
||||
" model=model_name,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0,\n",
|
||||
" top_p=0.95,\n",
|
||||
" max_tokens=1024,\n",
|
||||
" stream=True, # Enable streaming\n",
|
||||
" tools=tools,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"texts = \"\"\n",
|
||||
"tool_calls = []\n",
|
||||
"name = \"\"\n",
|
||||
"arguments = \"\"\n",
|
||||
"for chunk in response_stream:\n",
|
||||
" if chunk.choices[0].delta.content:\n",
|
||||
" texts += chunk.choices[0].delta.content\n",
|
||||
" if chunk.choices[0].delta.tool_calls:\n",
|
||||
" tool_calls.append(chunk.choices[0].delta.tool_calls[0])\n",
|
||||
"print_highlight(\"==== Text ====\")\n",
|
||||
"print_highlight(texts)\n",
|
||||
"\n",
|
||||
"print_highlight(\"==== Tool Call ====\")\n",
|
||||
"for tool_call in tool_calls:\n",
|
||||
" print_highlight(tool_call)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Handle Tools\n",
|
||||
"When the engine determines it should call a particular tool, it will return arguments or partial arguments through the response. You can parse these arguments and later invoke the tool accordingly."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Parse and combine function call arguments\n",
|
||||
"arguments = []\n",
|
||||
"for tool_call in tool_calls:\n",
|
||||
" if tool_call.function.name:\n",
|
||||
" print_highlight(f\"Streamed function call name: {tool_call.function.name}\")\n",
|
||||
"\n",
|
||||
" if tool_call.function.arguments:\n",
|
||||
" arguments.append(tool_call.function.arguments)\n",
|
||||
"\n",
|
||||
"# Combine all fragments into a single JSON string\n",
|
||||
"full_arguments = \"\".join(arguments)\n",
|
||||
"print_highlight(f\"streamed function call arguments: {full_arguments}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Define a Tool Function"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# This is a demonstration, define real function according to your usage.\n",
|
||||
"def get_current_weather(city: str, state: str, unit: \"str\"):\n",
|
||||
" return (\n",
|
||||
" f\"The weather in {city}, {state} is 85 degrees {unit}. It is \"\n",
|
||||
" \"partly cloudy, with highs in the 90's.\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"available_tools = {\"get_current_weather\": get_current_weather}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"### Execute the Tool"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"messages.append(response_non_stream.choices[0].message)\n",
|
||||
"\n",
|
||||
"# Call the corresponding tool function\n",
|
||||
"tool_call = messages[-1].tool_calls[0]\n",
|
||||
"tool_name = tool_call.function.name\n",
|
||||
"tool_to_call = available_tools[tool_name]\n",
|
||||
"result = tool_to_call(**(json.loads(tool_call.function.arguments)))\n",
|
||||
"print_highlight(f\"Function call result: {result}\")\n",
|
||||
"# messages.append({\"role\": \"tool\", \"content\": result, \"name\": tool_name})\n",
|
||||
"messages.append(\n",
|
||||
" {\n",
|
||||
" \"role\": \"tool\",\n",
|
||||
" \"tool_call_id\": tool_call.id,\n",
|
||||
" \"content\": str(result),\n",
|
||||
" \"name\": tool_name,\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(f\"Updated message history: {messages}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Send Results Back to Model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"final_response = client.chat.completions.create(\n",
|
||||
" model=model_name,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0,\n",
|
||||
" top_p=0.95,\n",
|
||||
" stream=False,\n",
|
||||
" tools=tools,\n",
|
||||
")\n",
|
||||
"print_highlight(\"Non-stream response:\")\n",
|
||||
"print_highlight(final_response)\n",
|
||||
"\n",
|
||||
"print_highlight(\"==== Text ====\")\n",
|
||||
"print_highlight(final_response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Native API and SGLang Runtime (SRT)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import AutoTokenizer\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"# generate an answer\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-7B-Instruct\")\n",
|
||||
"\n",
|
||||
"messages = get_messages()\n",
|
||||
"\n",
|
||||
"input = tokenizer.apply_chat_template(\n",
|
||||
" messages,\n",
|
||||
" tokenize=False,\n",
|
||||
" add_generation_prompt=True,\n",
|
||||
" tools=tools,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"gen_url = f\"http://localhost:{port}/generate\"\n",
|
||||
"gen_data = {\n",
|
||||
" \"text\": input,\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"skip_special_tokens\": False,\n",
|
||||
" \"max_new_tokens\": 1024,\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"top_p\": 0.95,\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
|
||||
"print_highlight(\"==== Response ====\")\n",
|
||||
"print_highlight(gen_response)\n",
|
||||
"\n",
|
||||
"# parse the response\n",
|
||||
"parse_url = f\"http://localhost:{port}/parse_function_call\"\n",
|
||||
"\n",
|
||||
"function_call_input = {\n",
|
||||
" \"text\": gen_response,\n",
|
||||
" \"tool_call_parser\": \"qwen25\",\n",
|
||||
" \"tools\": tools,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"function_call_response = requests.post(parse_url, json=function_call_input)\n",
|
||||
"function_call_response_json = function_call_response.json()\n",
|
||||
"\n",
|
||||
"print_highlight(\"==== Text ====\")\n",
|
||||
"print(function_call_response_json[\"normal_text\"])\n",
|
||||
"print_highlight(\"==== Calls ====\")\n",
|
||||
"print(\"function name: \", function_call_response_json[\"calls\"][0][\"name\"])\n",
|
||||
"print(\"function arguments: \", function_call_response_json[\"calls\"][0][\"parameters\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Offline Engine API"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sglang as sgl\n",
|
||||
"from sglang.srt.function_call.function_call_parser import FunctionCallParser\n",
|
||||
"from sglang.srt.managers.io_struct import Tool, Function\n",
|
||||
"\n",
|
||||
"llm = sgl.Engine(model_path=\"Qwen/Qwen2.5-7B-Instruct\")\n",
|
||||
"tokenizer = llm.tokenizer_manager.tokenizer\n",
|
||||
"input_ids = tokenizer.apply_chat_template(\n",
|
||||
" messages, tokenize=True, add_generation_prompt=True, tools=tools\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Note that for gpt-oss tool parser, adding \"no_stop_trim\": True\n",
|
||||
"# to make sure the tool call token <call> is not trimmed.\n",
|
||||
"\n",
|
||||
"sampling_params = {\n",
|
||||
" \"max_new_tokens\": 1024,\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"top_p\": 0.95,\n",
|
||||
" \"skip_special_tokens\": False,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# 1) Offline generation\n",
|
||||
"result = llm.generate(input_ids=input_ids, sampling_params=sampling_params)\n",
|
||||
"generated_text = result[\"text\"] # Assume there is only one prompt\n",
|
||||
"\n",
|
||||
"print_highlight(\"=== Offline Engine Output Text ===\")\n",
|
||||
"print_highlight(generated_text)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# 2) Parse using FunctionCallParser\n",
|
||||
"def convert_dict_to_tool(tool_dict: dict) -> Tool:\n",
|
||||
" function_dict = tool_dict.get(\"function\", {})\n",
|
||||
" return Tool(\n",
|
||||
" type=tool_dict.get(\"type\", \"function\"),\n",
|
||||
" function=Function(\n",
|
||||
" name=function_dict.get(\"name\"),\n",
|
||||
" description=function_dict.get(\"description\"),\n",
|
||||
" parameters=function_dict.get(\"parameters\"),\n",
|
||||
" ),\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"tools = [convert_dict_to_tool(raw_tool) for raw_tool in tools]\n",
|
||||
"\n",
|
||||
"parser = FunctionCallParser(tools=tools, tool_call_parser=\"qwen25\")\n",
|
||||
"normal_text, calls = parser.parse_non_stream(generated_text)\n",
|
||||
"\n",
|
||||
"print_highlight(\"=== Parsing Result ===\")\n",
|
||||
"print(\"Normal text portion:\", normal_text)\n",
|
||||
"print_highlight(\"Function call portion:\")\n",
|
||||
"for call in calls:\n",
|
||||
" # call: ToolCallItem\n",
|
||||
" print_highlight(f\" - tool name: {call.name}\")\n",
|
||||
" print_highlight(f\" parameters: {call.parameters}\")\n",
|
||||
"\n",
|
||||
"# 3) If needed, perform additional logic on the parsed functions, such as automatically calling the corresponding function to obtain a return value, etc."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm.shutdown()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Tool Choice Mode\n",
|
||||
"\n",
|
||||
"SGLang supports OpenAI's `tool_choice` parameter to control when and which tools the model should call. This feature is implemented using EBNF (Extended Backus-Naur Form) grammar to ensure reliable tool calling behavior.\n",
|
||||
"\n",
|
||||
"### Supported Tool Choice Options\n",
|
||||
"\n",
|
||||
"- **`tool_choice=\"required\"`**: Forces the model to call at least one tool\n",
|
||||
"- **`tool_choice={\"type\": \"function\", \"function\": {\"name\": \"specific_function\"}}`**: Forces the model to call a specific function\n",
|
||||
"\n",
|
||||
"### Backend Compatibility\n",
|
||||
"\n",
|
||||
"Tool choice is fully supported with the **Xgrammar backend**, which is the default grammar backend (`--grammar-backend xgrammar`). However, it may not be fully supported with other backends such as `outlines`.\n",
|
||||
"\n",
|
||||
"### Example: Required Tool Choice"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from openai import OpenAI\n",
|
||||
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"\n",
|
||||
"# Start a new server session for tool choice examples\n",
|
||||
"server_process_tool_choice, port_tool_choice = launch_server_cmd(\n",
|
||||
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\"\n",
|
||||
")\n",
|
||||
"wait_for_server(f\"http://localhost:{port_tool_choice}\")\n",
|
||||
"\n",
|
||||
"# Initialize client for tool choice examples\n",
|
||||
"client_tool_choice = OpenAI(\n",
|
||||
" api_key=\"None\", base_url=f\"http://0.0.0.0:{port_tool_choice}/v1\"\n",
|
||||
")\n",
|
||||
"model_name_tool_choice = client_tool_choice.models.list().data[0].id\n",
|
||||
"\n",
|
||||
"# Example with tool_choice=\"required\" - forces the model to call a tool\n",
|
||||
"messages_required = [\n",
|
||||
" {\"role\": \"user\", \"content\": \"Hello, what is the capital of France?\"}\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Define tools\n",
|
||||
"tools = [\n",
|
||||
" {\n",
|
||||
" \"type\": \"function\",\n",
|
||||
" \"function\": {\n",
|
||||
" \"name\": \"get_current_weather\",\n",
|
||||
" \"description\": \"Get the current weather in a given location\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"city\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
|
||||
" },\n",
|
||||
" \"unit\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The unit to fetch the temperature in\",\n",
|
||||
" \"enum\": [\"celsius\", \"fahrenheit\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" \"required\": [\"city\", \"unit\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"response_required = client_tool_choice.chat.completions.create(\n",
|
||||
" model=model_name_tool_choice,\n",
|
||||
" messages=messages_required,\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=1024,\n",
|
||||
" tools=tools,\n",
|
||||
" tool_choice=\"required\", # Force the model to call a tool\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(\"Response with tool_choice='required':\")\n",
|
||||
"print(\"Content:\", response_required.choices[0].message.content)\n",
|
||||
"print(\"Tool calls:\", response_required.choices[0].message.tool_calls)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Example: Specific Function Choice\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Example with specific function choice - forces the model to call a specific function\n",
|
||||
"messages_specific = [\n",
|
||||
" {\"role\": \"user\", \"content\": \"What are the most attractive places in France?\"}\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"response_specific = client_tool_choice.chat.completions.create(\n",
|
||||
" model=model_name_tool_choice,\n",
|
||||
" messages=messages_specific,\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=1024,\n",
|
||||
" tools=tools,\n",
|
||||
" tool_choice={\n",
|
||||
" \"type\": \"function\",\n",
|
||||
" \"function\": {\"name\": \"get_current_weather\"},\n",
|
||||
" }, # Force the model to call the specific get_current_weather function\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(\"Response with specific function choice:\")\n",
|
||||
"print(\"Content:\", response_specific.choices[0].message.content)\n",
|
||||
"print(\"Tool calls:\", response_specific.choices[0].message.tool_calls)\n",
|
||||
"\n",
|
||||
"if response_specific.choices[0].message.tool_calls:\n",
|
||||
" tool_call = response_specific.choices[0].message.tool_calls[0]\n",
|
||||
" print_highlight(f\"Called function: {tool_call.function.name}\")\n",
|
||||
" print_highlight(f\"Arguments: {tool_call.function.arguments}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process_tool_choice)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Pythonic Tool Call Format (Llama-3.2 / Llama-3.3 / Llama-4)\n",
|
||||
"\n",
|
||||
"Some Llama models (such as Llama-3.2-1B, Llama-3.2-3B, Llama-3.3-70B, and Llama-4) support a \"pythonic\" tool call format, where the model outputs function calls as Python code, e.g.:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"[get_current_weather(city=\"San Francisco\", state=\"CA\", unit=\"celsius\")]\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"- The output is a Python list of function calls, with arguments as Python literals (not JSON).\n",
|
||||
"- Multiple tool calls can be returned in the same list:\n",
|
||||
"```python\n",
|
||||
"[get_current_weather(city=\"San Francisco\", state=\"CA\", unit=\"celsius\"),\n",
|
||||
" get_current_weather(city=\"New York\", state=\"NY\", unit=\"fahrenheit\")]\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"For more information, refer to Meta’s documentation on [Zero shot function calling](https://github.com/meta-llama/llama-models/blob/main/models/llama4/prompt_format.md#zero-shot-function-calling---system-message).\n",
|
||||
"\n",
|
||||
"Note that this feature is still under development on Blackwell.\n",
|
||||
"\n",
|
||||
"### How to enable\n",
|
||||
"- Launch the server with `--tool-call-parser pythonic`\n",
|
||||
"- You may also specify --chat-template with the improved template for the model (e.g., `--chat-template=examples/chat_template/tool_chat_template_llama4_pythonic.jinja`).\n",
|
||||
"This is recommended because the model expects a special prompt format to reliably produce valid pythonic tool call outputs. The template ensures that the prompt structure (e.g., special tokens, message boundaries like `<|eom|>`, and function call delimiters) matches what the model was trained or fine-tuned on. If you do not use the correct chat template, tool calling may fail or produce inconsistent results.\n",
|
||||
"\n",
|
||||
"#### Forcing Pythonic Tool Call Output Without a Chat Template\n",
|
||||
"If you don't want to specify a chat template, you must give the model extremely explicit instructions in your messages to enforce pythonic output. For example, for `Llama-3.2-1B-Instruct`, you need:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import openai\n",
|
||||
"\n",
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1 --log-level warning\" # llama-3.2-1b-instruct\n",
|
||||
")\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")\n",
|
||||
"\n",
|
||||
"tools = [\n",
|
||||
" {\n",
|
||||
" \"type\": \"function\",\n",
|
||||
" \"function\": {\n",
|
||||
" \"name\": \"get_weather\",\n",
|
||||
" \"description\": \"Get the current weather for a given location.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"location\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The name of the city or location.\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"required\": [\"location\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"type\": \"function\",\n",
|
||||
" \"function\": {\n",
|
||||
" \"name\": \"get_tourist_attractions\",\n",
|
||||
" \"description\": \"Get a list of top tourist attractions for a given city.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"city\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The name of the city to find attractions for.\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"required\": [\"city\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_messages():\n",
|
||||
" return [\n",
|
||||
" {\n",
|
||||
" \"role\": \"system\",\n",
|
||||
" \"content\": (\n",
|
||||
" \"You are a travel assistant. \"\n",
|
||||
" \"When asked to call functions, ALWAYS respond ONLY with a python list of function calls, \"\n",
|
||||
" \"using this format: [func_name1(param1=value1, param2=value2), func_name2(param=value)]. \"\n",
|
||||
" \"Do NOT use JSON, do NOT use variables, do NOT use any other format. \"\n",
|
||||
" \"Here is an example:\\n\"\n",
|
||||
" '[get_weather(location=\"Paris\"), get_tourist_attractions(city=\"Paris\")]'\n",
|
||||
" ),\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": (\n",
|
||||
" \"I'm planning a trip to Tokyo next week. What's the weather like and what are some top tourist attractions? \"\n",
|
||||
" \"Propose parallel tool calls at once, using the python list of function calls format as shown above.\"\n",
|
||||
" ),\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"messages = get_messages()\n",
|
||||
"\n",
|
||||
"client = openai.Client(base_url=f\"http://localhost:{port}/v1\", api_key=\"xxxxxx\")\n",
|
||||
"model_name = client.models.list().data[0].id\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"response_non_stream = client.chat.completions.create(\n",
|
||||
" model=model_name,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0,\n",
|
||||
" top_p=0.9,\n",
|
||||
" stream=False, # Non-streaming\n",
|
||||
" tools=tools,\n",
|
||||
")\n",
|
||||
"print_highlight(\"Non-stream response:\")\n",
|
||||
"print_highlight(response_non_stream)\n",
|
||||
"\n",
|
||||
"response_stream = client.chat.completions.create(\n",
|
||||
" model=model_name,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0,\n",
|
||||
" top_p=0.9,\n",
|
||||
" stream=True,\n",
|
||||
" tools=tools,\n",
|
||||
")\n",
|
||||
"texts = \"\"\n",
|
||||
"tool_calls = []\n",
|
||||
"name = \"\"\n",
|
||||
"arguments = \"\"\n",
|
||||
"\n",
|
||||
"for chunk in response_stream:\n",
|
||||
" if chunk.choices[0].delta.content:\n",
|
||||
" texts += chunk.choices[0].delta.content\n",
|
||||
" if chunk.choices[0].delta.tool_calls:\n",
|
||||
" tool_calls.append(chunk.choices[0].delta.tool_calls[0])\n",
|
||||
"\n",
|
||||
"print_highlight(\"Streaming Response:\")\n",
|
||||
"print_highlight(\"==== Text ====\")\n",
|
||||
"print_highlight(texts)\n",
|
||||
"\n",
|
||||
"print_highlight(\"==== Tool Call ====\")\n",
|
||||
"for tool_call in tool_calls:\n",
|
||||
" print_highlight(tool_call)\n",
|
||||
"\n",
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **Note:** \n",
|
||||
"> The model may still default to JSON if it was heavily finetuned on that format. Prompt engineering (including examples) is the only way to increase the chance of pythonic output if you are not using a chat template."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## How to support a new model?\n",
|
||||
"1. Update the TOOLS_TAG_LIST in sglang/srt/function_call_parser.py with the model’s tool tags. Currently supported tags include:\n",
|
||||
"```\n",
|
||||
"\tTOOLS_TAG_LIST = [\n",
|
||||
"\t \"<|plugin|>\",\n",
|
||||
"\t \"<function=\",\n",
|
||||
"\t \"<tool_call>\",\n",
|
||||
"\t \"<|python_tag|>\",\n",
|
||||
"\t \"[TOOL_CALLS]\"\n",
|
||||
"\t]\n",
|
||||
"```\n",
|
||||
"2. Create a new detector class in sglang/srt/function_call_parser.py that inherits from BaseFormatDetector. The detector should handle the model’s specific function call format. For example:\n",
|
||||
"```\n",
|
||||
" class NewModelDetector(BaseFormatDetector):\n",
|
||||
"```\n",
|
||||
"3. Add the new detector to the MultiFormatParser class that manages all the format detectors."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
325
docs/advanced_features/vlm_query.ipynb
Normal file
325
docs/advanced_features/vlm_query.ipynb
Normal file
@@ -0,0 +1,325 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Query Vision Language Model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Querying Qwen-VL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply() # Run this first.\n",
|
||||
"\n",
|
||||
"model_path = \"Qwen/Qwen2.5-VL-3B-Instruct\"\n",
|
||||
"chat_template = \"qwen2-vl\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Lets create a prompt.\n",
|
||||
"\n",
|
||||
"from io import BytesIO\n",
|
||||
"import requests\n",
|
||||
"from PIL import Image\n",
|
||||
"\n",
|
||||
"from sglang.srt.parser.conversation import chat_templates\n",
|
||||
"\n",
|
||||
"image = Image.open(\n",
|
||||
" BytesIO(\n",
|
||||
" requests.get(\n",
|
||||
" \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
|
||||
" ).content\n",
|
||||
" )\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"conv = chat_templates[chat_template].copy()\n",
|
||||
"conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
|
||||
"conv.append_message(conv.roles[1], \"\")\n",
|
||||
"conv.image_data = [image]\n",
|
||||
"\n",
|
||||
"print(conv.get_prompt())\n",
|
||||
"image"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Query via the offline Engine API"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sglang import Engine\n",
|
||||
"\n",
|
||||
"llm = Engine(\n",
|
||||
" model_path=model_path, chat_template=chat_template, mem_fraction_static=0.8\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
|
||||
"print(out[\"text\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Query via the offline Engine API, but send precomputed embeddings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Compute the image embeddings using Huggingface.\n",
|
||||
"\n",
|
||||
"from transformers import AutoProcessor\n",
|
||||
"from transformers import Qwen2_5_VLForConditionalGeneration\n",
|
||||
"\n",
|
||||
"processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
|
||||
"vision = (\n",
|
||||
" Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path).eval().visual.cuda()\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"processed_prompt = processor(\n",
|
||||
" images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
|
||||
")\n",
|
||||
"input_ids = processed_prompt[\"input_ids\"][0].detach().cpu().tolist()\n",
|
||||
"precomputed_embeddings = vision(\n",
|
||||
" processed_prompt[\"pixel_values\"].cuda(), processed_prompt[\"image_grid_thw\"].cuda()\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"mm_item = dict(\n",
|
||||
" modality=\"IMAGE\",\n",
|
||||
" image_grid_thw=processed_prompt[\"image_grid_thw\"],\n",
|
||||
" precomputed_embeddings=precomputed_embeddings,\n",
|
||||
")\n",
|
||||
"out = llm.generate(input_ids=input_ids, image_data=[mm_item])\n",
|
||||
"print(out[\"text\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "10",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Querying Llama 4 (Vision)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "11",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply() # Run this first.\n",
|
||||
"\n",
|
||||
"model_path = \"meta-llama/Llama-4-Scout-17B-16E-Instruct\"\n",
|
||||
"chat_template = \"llama-4\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "12",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Lets create a prompt.\n",
|
||||
"\n",
|
||||
"from io import BytesIO\n",
|
||||
"import requests\n",
|
||||
"from PIL import Image\n",
|
||||
"\n",
|
||||
"from sglang.srt.parser.conversation import chat_templates\n",
|
||||
"\n",
|
||||
"image = Image.open(\n",
|
||||
" BytesIO(\n",
|
||||
" requests.get(\n",
|
||||
" \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
|
||||
" ).content\n",
|
||||
" )\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"conv = chat_templates[chat_template].copy()\n",
|
||||
"conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
|
||||
"conv.append_message(conv.roles[1], \"\")\n",
|
||||
"conv.image_data = [image]\n",
|
||||
"\n",
|
||||
"print(conv.get_prompt())\n",
|
||||
"print(f\"Image size: {image.size}\")\n",
|
||||
"\n",
|
||||
"image"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "13",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Query via the offline Engine API"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "14",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sglang.test.test_utils import is_in_ci\n",
|
||||
"\n",
|
||||
"if not is_in_ci():\n",
|
||||
" from sglang import Engine\n",
|
||||
"\n",
|
||||
" llm = Engine(\n",
|
||||
" model_path=model_path,\n",
|
||||
" trust_remote_code=True,\n",
|
||||
" enable_multimodal=True,\n",
|
||||
" mem_fraction_static=0.8,\n",
|
||||
" tp_size=4,\n",
|
||||
" attention_backend=\"fa3\",\n",
|
||||
" context_length=65536,\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "15",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if not is_in_ci():\n",
|
||||
" out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
|
||||
" print(out[\"text\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "16",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Query via the offline Engine API, but send precomputed embeddings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "17",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if not is_in_ci():\n",
|
||||
" # Compute the image embeddings using Huggingface.\n",
|
||||
"\n",
|
||||
" from transformers import AutoProcessor\n",
|
||||
" from transformers import Llama4ForConditionalGeneration\n",
|
||||
"\n",
|
||||
" processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
|
||||
" model = Llama4ForConditionalGeneration.from_pretrained(\n",
|
||||
" model_path, torch_dtype=\"auto\"\n",
|
||||
" ).eval()\n",
|
||||
" vision = model.vision_model.cuda()\n",
|
||||
" multi_modal_projector = model.multi_modal_projector.cuda()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "18",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if not is_in_ci():\n",
|
||||
" processed_prompt = processor(\n",
|
||||
" images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
|
||||
" )\n",
|
||||
" print(f'{processed_prompt[\"pixel_values\"].shape=}')\n",
|
||||
" input_ids = processed_prompt[\"input_ids\"][0].detach().cpu().tolist()\n",
|
||||
"\n",
|
||||
" image_outputs = vision(\n",
|
||||
" processed_prompt[\"pixel_values\"].to(\"cuda\"), output_hidden_states=False\n",
|
||||
" )\n",
|
||||
" image_features = image_outputs.last_hidden_state\n",
|
||||
" vision_flat = image_features.view(-1, image_features.size(-1))\n",
|
||||
" precomputed_embeddings = multi_modal_projector(vision_flat)\n",
|
||||
"\n",
|
||||
" mm_item = dict(modality=\"IMAGE\", precomputed_embeddings=precomputed_embeddings)\n",
|
||||
" out = llm.generate(input_ids=input_ids, image_data=[mm_item])\n",
|
||||
" print(out[\"text\"])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"jupytext": {
|
||||
"cell_metadata_filter": "-all",
|
||||
"custom_cell_magics": "kql",
|
||||
"encoding": "# -*- coding: utf-8 -*-"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
228
docs/basic_usage/deepseek.md
Normal file
228
docs/basic_usage/deepseek.md
Normal file
@@ -0,0 +1,228 @@
|
||||
# DeepSeek Usage
|
||||
|
||||
SGLang provides many optimizations specifically designed for the DeepSeek models, making it the inference engine recommended by the official [DeepSeek team](https://github.com/deepseek-ai/DeepSeek-V3/tree/main?tab=readme-ov-file#62-inference-with-sglang-recommended) from Day 0.
|
||||
|
||||
This document outlines current optimizations for DeepSeek.
|
||||
For an overview of the implemented features see the completed [Roadmap](https://github.com/sgl-project/sglang/issues/2591).
|
||||
|
||||
## Launch DeepSeek V3.1/V3/R1 with SGLang
|
||||
|
||||
To run DeepSeek V3.1/V3/R1 models, the recommended settings are as follows:
|
||||
|
||||
| Weight Type | Configuration |
|
||||
|------------|-------------------|
|
||||
| **Full precision FP8**<br>*(recommended)* | 8 x H200 |
|
||||
| | 8 x MI300X |
|
||||
| | 2 x 8 x H100/800/20 |
|
||||
| | Xeon 6980P CPU |
|
||||
| **Full precision BF16** | 2 x 8 x H200 |
|
||||
| | 2 x 8 x MI300X |
|
||||
| | 4 x 8 x H100/800/20 |
|
||||
| | 4 x 8 x A100/A800 |
|
||||
| **Quantized weights (AWQ)** | 8 x H100/800/20 |
|
||||
| | 8 x A100/A800 |
|
||||
| **Quantized weights (int8)** | 16 x A100/A800 |
|
||||
| | 32 x L40S |
|
||||
| | Xeon 6980P CPU |
|
||||
| | 2 x Atlas 800I A3 |
|
||||
|
||||
<style>
|
||||
.md-typeset__table {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.md-typeset__table table {
|
||||
border-collapse: collapse;
|
||||
margin: 1em 0;
|
||||
border: 2px solid var(--md-typeset-table-color);
|
||||
table-layout: fixed;
|
||||
}
|
||||
|
||||
.md-typeset__table th {
|
||||
border: 1px solid var(--md-typeset-table-color);
|
||||
border-bottom: 2px solid var(--md-typeset-table-color);
|
||||
background-color: var(--md-default-bg-color--lighter);
|
||||
padding: 12px;
|
||||
}
|
||||
|
||||
.md-typeset__table td {
|
||||
border: 1px solid var(--md-typeset-table-color);
|
||||
padding: 12px;
|
||||
}
|
||||
|
||||
.md-typeset__table tr:nth-child(2n) {
|
||||
background-color: var(--md-default-bg-color--lightest);
|
||||
}
|
||||
</style>
|
||||
|
||||
Detailed commands for reference:
|
||||
|
||||
- [8 x H200](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended)
|
||||
- [8 x MI300X](../platforms/amd_gpu.md#running-deepseek-v3)
|
||||
- [2 x 8 x H200](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-two-h208-nodes)
|
||||
- [4 x 8 x A100](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-four-a1008-nodes)
|
||||
- [8 x A100 (AWQ)](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-8-a100a800-with-awq-quantization)
|
||||
- [16 x A100 (int8)](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-16-a100a800-with-int8-quantization)
|
||||
- [32 x L40S (int8)](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-32-l40s-with-int8-quantization)
|
||||
- [Xeon 6980P CPU](../platforms/cpu_server.md#example-running-deepseek-r1)
|
||||
- [2 x Atlas 800I A3 (int8)](../platforms/ascend_npu.md#running-deepseek-v3)
|
||||
|
||||
### Download Weights
|
||||
If you encounter errors when starting the server, ensure the weights have finished downloading. It's recommended to download them beforehand or restart multiple times until all weights are downloaded. Please refer to [DeepSeek V3](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base#61-inference-with-deepseek-infer-demo-example-only) official guide to download the weights.
|
||||
|
||||
### Launch with one node of 8 x H200
|
||||
Please refer to [the example](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#installation--launch).
|
||||
**Note that Deepseek V3 is already in FP8**, so we should not run it with any quantization arguments like `--quantization fp8 --kv-cache-dtype fp8_e5m2`.
|
||||
|
||||
### Running examples on Multi-node
|
||||
|
||||
- [Serving with two H20*8 nodes](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-two-h208-nodes).
|
||||
|
||||
- [Serving with two H200*8 nodes and docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-two-h2008-nodes-and-docker).
|
||||
|
||||
- [Serving with four A100*8 nodes](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-four-a1008-nodes).
|
||||
|
||||
## Optimizations
|
||||
|
||||
### Multi-head Latent Attention (MLA) Throughput Optimizations
|
||||
|
||||
**Description**: [MLA](https://arxiv.org/pdf/2405.04434) is an innovative attention mechanism introduced by the DeepSeek team, aimed at improving inference efficiency. SGLang has implemented specific optimizations for this, including:
|
||||
|
||||
- **Weight Absorption**: By applying the associative law of matrix multiplication to reorder computation steps, this method balances computation and memory access and improves efficiency in the decoding phase.
|
||||
|
||||
- **MLA Attention Backends**: Currently SGLang supports different optimized MLA attention backends, including [FlashAttention3](https://github.com/Dao-AILab/flash-attention), [Flashinfer](https://docs.flashinfer.ai/api/mla.html), [FlashMLA](https://github.com/deepseek-ai/FlashMLA), [CutlassMLA](https://github.com/sgl-project/sglang/pull/5390), **TRTLLM MLA** (optimized for Blackwell architecture), and [Triton](https://github.com/triton-lang/triton) backends. The default FA3 provides good performance across wide workloads.
|
||||
|
||||
- **FP8 Quantization**: W8A8 FP8 and KV Cache FP8 quantization enables efficient FP8 inference. Additionally, we have implemented Batched Matrix Multiplication (BMM) operator to facilitate FP8 inference in MLA with weight absorption.
|
||||
|
||||
- **CUDA Graph & Torch.compile**: Both MLA and Mixture of Experts (MoE) are compatible with CUDA Graph and Torch.compile, which reduces latency and accelerates decoding speed for small batch sizes.
|
||||
|
||||
- **Chunked Prefix Cache**: Chunked prefix cache optimization can increase throughput by cutting prefix cache into chunks, processing them with multi-head attention and merging their states. Its improvement can be significant when doing chunked prefill on long sequences. Currently this optimization is only available for FlashAttention3 backend.
|
||||
|
||||
Overall, with these optimizations, we have achieved up to **7x** acceleration in output throughput compared to the previous version.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://lmsys.org/images/blog/sglang_v0_3/deepseek_mla.svg" alt="Multi-head Latent Attention for DeepSeek Series Models">
|
||||
</p>
|
||||
|
||||
**Usage**: MLA optimization is enabled by default. For MLA models on Blackwell architecture (e.g., B200), the default backend is FlashInfer. To use the optimized TRTLLM MLA backend for prefill and decode operations, explicitly specify `--attention-backend trtllm_mla`.
|
||||
|
||||
**Reference**: Check [Blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) and [Slides](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/lmsys_1st_meetup_deepseek_mla.pdf) for more details.
|
||||
|
||||
### Data Parallelism Attention
|
||||
|
||||
**Description**: This optimization involves data parallelism (DP) for the MLA attention mechanism of DeepSeek Series Models, which allows for a significant reduction in the KV cache size, enabling larger batch sizes. Each DP worker independently handles different types of batches (prefill, decode, idle), which are then synchronized before and after processing through the Mixture-of-Experts (MoE) layer. If you do not use DP attention, KV cache will be duplicated among all TP ranks.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://lmsys.org/images/blog/sglang_v0_4/dp_attention.svg" alt="Data Parallelism Attention for DeepSeek Series Models">
|
||||
</p>
|
||||
|
||||
With data parallelism attention enabled, we have achieved up to **1.9x** decoding throughput improvement compared to the previous version.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://lmsys.org/images/blog/sglang_v0_4/deepseek_coder_v2.svg" alt="Data Parallelism Attention Performance Comparison">
|
||||
</p>
|
||||
|
||||
**Usage**:
|
||||
- Append `--enable-dp-attention --tp 8 --dp 8` to the server arguments when using 8 H200 GPUs. This optimization improves peak throughput in high batch size scenarios where the server is limited by KV cache capacity. However, it is not recommended for low-latency, small-batch use cases.
|
||||
- DP and TP attention can be flexibly combined. For example, to deploy DeepSeek-V3/R1 on 2 nodes with 8 H100 GPUs each, you can specify `--enable-dp-attention --tp 16 --dp 2`. This configuration runs attention with 2 DP groups, each containing 8 TP GPUs.
|
||||
|
||||
**Reference**: Check [Blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models).
|
||||
|
||||
### Multi Node Tensor Parallelism
|
||||
|
||||
**Description**: For users with limited memory on a single node, SGLang supports serving DeepSeek Series Models, including DeepSeek V3, across multiple nodes using tensor parallelism. This approach partitions the model parameters across multiple GPUs or nodes to handle models that are too large for one node's memory.
|
||||
|
||||
**Usage**: Check [here](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-2-h208) for usage examples.
|
||||
|
||||
### Block-wise FP8
|
||||
|
||||
**Description**: SGLang implements block-wise FP8 quantization with two key optimizations:
|
||||
|
||||
- **Activation**: E4M3 format using per-token-per-128-channel sub-vector scales with online casting.
|
||||
|
||||
- **Weight**: Per-128x128-block quantization for better numerical stability.
|
||||
|
||||
- **DeepGEMM**: The [DeepGEMM](https://github.com/deepseek-ai/DeepGEMM) kernel library optimized for FP8 matrix multiplications.
|
||||
|
||||
**Usage**: The activation and weight optimization above are turned on by default for DeepSeek V3 models. DeepGEMM is enabled by default on NVIDIA Hopper GPUs and disabled by default on other devices. DeepGEMM can also be manually turned off by setting the environment variable `SGL_ENABLE_JIT_DEEPGEMM=0`.
|
||||
|
||||
Before serving the DeepSeek model, precompile the DeepGEMM kernels using:
|
||||
```bash
|
||||
python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code
|
||||
```
|
||||
The precompilation process typically takes around 10 minutes to complete.
|
||||
|
||||
### Multi-token Prediction
|
||||
**Description**: SGLang implements DeepSeek V3 Multi-Token Prediction (MTP) based on [EAGLE speculative decoding](https://docs.sglang.ai/advanced_features/speculative_decoding.html#EAGLE-Decoding). With this optimization, the decoding speed can be improved by **1.8x** for batch size 1 and **1.5x** for batch size 32 respectively on H200 TP8 setting.
|
||||
|
||||
**Usage**:
|
||||
Add arguments `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example:
|
||||
```
|
||||
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --speculative-algorithm EAGLE --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --trust-remote-code --tp 8
|
||||
```
|
||||
- The best configuration for `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` can be searched with [bench_speculative.py](https://github.com/sgl-project/sglang/blob/main/scripts/playground/bench_speculative.py) script for given batch size. The minimum configuration is `--speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2`, which can achieve speedup for larger batch sizes.
|
||||
- The FlashAttention3, FlashMLA, and Triton backends fully support MTP usage. For the FlashInfer backend (`--attention-backend flashinfer`) with speculative decoding, the `--speculative-eagle-topk` parameter should be set to `1`. MTP support for the CutlassMLA and TRTLLM MLA backends is still under development.
|
||||
- To enable DeepSeek MTP for large batch sizes (>32), some parameters should be changed (Reference [this discussion](https://github.com/sgl-project/sglang/issues/4543#issuecomment-2737413756)):
|
||||
- Adjust `--max-running-requests` to a larger number. The default value is `32` for MTP. For larger batch sizes, you should increase this value beyond the default value.
|
||||
- Set `--cuda-graph-bs`. It's a list of batch sizes for cuda graph capture. The default captured batch sizes for speculative decoding is set [here](https://github.com/sgl-project/sglang/blob/49420741746c8f3e80e0eb17e7d012bfaf25793a/python/sglang/srt/model_executor/cuda_graph_runner.py#L126). You can include more batch sizes into it.
|
||||
|
||||
|
||||
### Reasoning Content for DeepSeek R1 & V3.1
|
||||
|
||||
See [Reasoning Parser](https://docs.sglang.ai/advanced_features/separate_reasoning.html) and [Thinking Parameter for DeepSeek V3.1](https://docs.sglang.ai/basic_usage/openai_api_completions.html#Example:-DeepSeek-V3-Models).
|
||||
|
||||
|
||||
### Function calling for DeepSeek Models
|
||||
|
||||
Add arguments `--tool-call-parser deepseekv3` and `--chat-template ./examples/chat_template/tool_chat_template_deepseekv3.jinja` (recommended) to enable this feature. For example (running on 1 * H20 node):
|
||||
|
||||
```
|
||||
python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --port 30000 --host 0.0.0.0 --mem-fraction-static 0.9 --tool-call-parser deepseekv3 --chat-template ./examples/chat_template/tool_chat_template_deepseekv3.jinja
|
||||
```
|
||||
|
||||
Sample Request:
|
||||
|
||||
```
|
||||
curl "http://127.0.0.1:30000/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"temperature": 0, "max_tokens": 100, "model": "deepseek-ai/DeepSeek-V3-0324", "tools": [{"type": "function", "function": {"name": "query_weather", "description": "Get weather of an city, the user should supply a city first", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city, e.g. Beijing"}}, "required": ["city"]}}}], "messages": [{"role": "user", "content": "Hows the weather like in Qingdao today"}]}'
|
||||
```
|
||||
|
||||
Expected Response
|
||||
|
||||
```
|
||||
{"id":"6501ef8e2d874006bf555bc80cddc7c5","object":"chat.completion","created":1745993638,"model":"deepseek-ai/DeepSeek-V3-0324","choices":[{"index":0,"message":{"role":"assistant","content":null,"reasoning_content":null,"tool_calls":[{"id":"0","index":null,"type":"function","function":{"name":"query_weather","arguments":"{\"city\": \"Qingdao\"}"}}]},"logprobs":null,"finish_reason":"tool_calls","matched_stop":null}],"usage":{"prompt_tokens":116,"total_tokens":138,"completion_tokens":22,"prompt_tokens_details":null}}
|
||||
|
||||
```
|
||||
Sample Streaming Request:
|
||||
```
|
||||
curl "http://127.0.0.1:30000/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"temperature": 0, "max_tokens": 100, "model": "deepseek-ai/DeepSeek-V3-0324","stream":true,"tools": [{"type": "function", "function": {"name": "query_weather", "description": "Get weather of an city, the user should supply a city first", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city, e.g. Beijing"}}, "required": ["city"]}}}], "messages": [{"role": "user", "content": "Hows the weather like in Qingdao today"}]}'
|
||||
```
|
||||
Expected Streamed Chunks (simplified for clarity):
|
||||
```
|
||||
data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"{\""}}]}}]}
|
||||
data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"city"}}]}}]}
|
||||
data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"\":\""}}]}}]}
|
||||
data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"Q"}}]}}]}
|
||||
data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"ing"}}]}}]}
|
||||
data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"dao"}}]}}]}
|
||||
data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"\"}"}}]}}]}
|
||||
data: {"choices":[{"delta":{"tool_calls":null}}], "finish_reason": "tool_calls"}
|
||||
data: [DONE]
|
||||
```
|
||||
The client needs to concatenate all arguments fragments to reconstruct the complete tool call:
|
||||
```
|
||||
{"city": "Qingdao"}
|
||||
```
|
||||
Important Notes:
|
||||
1. Use a lower `"temperature"` value for better results.
|
||||
2. To receive more consistent tool call results, it is recommended to use `--chat-template examples/chat_template/tool_chat_template_deepseekv3.jinja`. It provides an improved unified prompt.
|
||||
|
||||
|
||||
## FAQ
|
||||
|
||||
**Q: Model loading is taking too long, and I'm encountering an NCCL timeout. What should I do?**
|
||||
|
||||
A: If you're experiencing extended model loading times and an NCCL timeout, you can try increasing the timeout duration. Add the argument `--dist-timeout 3600` when launching your model. This will set the timeout to one hour, which often resolves the issue.
|
||||
114
docs/basic_usage/gpt_oss.md
Normal file
114
docs/basic_usage/gpt_oss.md
Normal file
@@ -0,0 +1,114 @@
|
||||
# GPT OSS Usage
|
||||
|
||||
Please refer to [https://github.com/sgl-project/sglang/issues/8833](https://github.com/sgl-project/sglang/issues/8833).
|
||||
|
||||
## Responses API & Built-in Tools
|
||||
|
||||
### Responses API
|
||||
|
||||
GPT‑OSS is compatible with the OpenAI Responses API. Use `client.responses.create(...)` with `model`, `instructions`, `input`, and optional `tools` to enable built‑in tool use.
|
||||
|
||||
### Built-in Tools
|
||||
|
||||
GPT‑OSS can call built‑in tools for web search and Python execution. You can use the demo tool server or connect to external MCP tool servers.
|
||||
|
||||
#### Python Tool
|
||||
|
||||
- Executes short Python snippets for calculations, parsing, and quick scripts.
|
||||
- By default runs in a Docker-based sandbox. To run on the host, set `PYTHON_EXECUTION_BACKEND=UV` (this executes model-generated code locally; use with care).
|
||||
- Ensure Docker is available if you are not using the UV backend. It is recommended to run `docker pull python:3.11` in advance.
|
||||
|
||||
#### Web Search Tool
|
||||
|
||||
- Uses the Exa backend for web search.
|
||||
- Requires an Exa API key; set `EXA_API_KEY` in your environment. Create a key at `https://exa.ai`.
|
||||
|
||||
### Tool & Reasoning Parser
|
||||
|
||||
- We support OpenAI Reasoning and Tool Call parser, as well as our SGLang native api for tool call and reasoning. Refer to [reasoning parser](../advanced_features/separate_reasoning.ipynb) and [tool call parser](../advanced_features/function_calling.ipynb) for more details.
|
||||
|
||||
|
||||
## Notes
|
||||
|
||||
- Use **Python 3.12** for the demo tools. And install the required `gpt-oss` packages.
|
||||
- The default demo integrates the web search tool (Exa backend) and a demo Python interpreter via Docker.
|
||||
- For search, set `EXA_API_KEY`. For Python execution, either have Docker available or set `PYTHON_EXECUTION_BACKEND=UV`.
|
||||
|
||||
Examples:
|
||||
```bash
|
||||
export EXA_API_KEY=YOUR_EXA_KEY
|
||||
# Optional: run Python tool locally instead of Docker (use with care)
|
||||
export PYTHON_EXECUTION_BACKEND=UV
|
||||
```
|
||||
|
||||
Launch the server with the demo tool server:
|
||||
|
||||
`python3 -m sglang.launch_server --model-path openai/gpt-oss-120b --tool-server demo --tp 2`
|
||||
|
||||
For production usage, sglang can act as an MCP client for multiple services. An [example tool server](https://github.com/openai/gpt-oss/tree/main/gpt-oss-mcp-server) is provided. Start the servers and point sglang to them:
|
||||
```bash
|
||||
mcp run -t sse browser_server.py:mcp
|
||||
mcp run -t sse python_server.py:mcp
|
||||
|
||||
python -m sglang.launch_server ... --tool-server ip-1:port-1,ip-2:port-2
|
||||
```
|
||||
The URLs should be MCP SSE servers that expose server information and well-documented tools. These tools are added to the system prompt so the model can use them.
|
||||
|
||||
### Quick Demo
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:30000/v1",
|
||||
api_key="sk-123456"
|
||||
)
|
||||
|
||||
tools = [
|
||||
{"type": "code_interpreter"},
|
||||
{"type": "web_search_preview"},
|
||||
]
|
||||
|
||||
# Test python tool
|
||||
response = client.responses.create(
|
||||
model="openai/gpt-oss-120b",
|
||||
instructions="You are a helpful assistant; you can use the python tool to execute code.",
|
||||
input="Use python tool to calculate the sum of 29138749187 and 29138749187", # 58,277,498,374
|
||||
tools=tools
|
||||
)
|
||||
print("====== test python tool ======")
|
||||
print(response.output_text)
|
||||
|
||||
# Test browser tool
|
||||
response = client.responses.create(
|
||||
model="openai/gpt-oss-120b",
|
||||
instructions="You are a helpful assistant; you can use the browser to search the web",
|
||||
input="Search the web for the latest news about Nvidia stock price",
|
||||
tools=tools
|
||||
)
|
||||
print("====== test browser tool ======")
|
||||
print(response.output_text)
|
||||
```
|
||||
|
||||
Example output:
|
||||
```
|
||||
====== test python tool ======
|
||||
The sum of 29,138,749,187 and 29,138,749,187 is **58,277,498,374**.
|
||||
====== test browser tool ======
|
||||
**Recent headlines on Nvidia (NVDA) stock**
|
||||
|
||||
| Date (2025) | Source | Key news points | Stock‑price detail |
|
||||
|-------------|--------|----------------|--------------------|
|
||||
| **May 13** | Reuters | The market data page shows Nvidia trading “higher” at **$116.61** with no change from the previous close. | **$116.61** – latest trade (delayed ≈ 15 min)【14†L34-L38】 |
|
||||
| **Aug 18** | CNBC | Morgan Stanley kept an **overweight** rating and lifted its price target to **$206** (up from $200), implying a 14 % upside from the Friday close. The firm notes Nvidia shares have already **jumped 34 % this year**. | No exact price quoted, but the article signals strong upside expectations【9†L27-L31】 |
|
||||
| **Aug 20** | The Motley Fool | Nvidia is set to release its Q2 earnings on Aug 27. The article lists the **current price of $175.36**, down 0.16 % on the day (as of 3:58 p.m. ET). | **$175.36** – current price on Aug 20【10†L12-L15】【10†L53-L57】 |
|
||||
|
||||
**What the news tells us**
|
||||
|
||||
* Nvidia’s share price has risen sharply this year – up roughly a third according to Morgan Stanley – and analysts are still raising targets (now $206).
|
||||
* The most recent market quote (Reuters, May 13) was **$116.61**, but the stock has surged since then, reaching **$175.36** by mid‑August.
|
||||
* Upcoming earnings on **Aug 27** are a focal point; both the Motley Fool and Morgan Stanley expect the results could keep the rally going.
|
||||
|
||||
**Bottom line:** Nvidia’s stock is on a strong upward trajectory in 2025, with price targets climbing toward $200‑$210 and the market price already near $175 as of late August.
|
||||
|
||||
```
|
||||
61
docs/basic_usage/llama4.md
Normal file
61
docs/basic_usage/llama4.md
Normal file
@@ -0,0 +1,61 @@
|
||||
# Llama4 Usage
|
||||
|
||||
[Llama 4](https://github.com/meta-llama/llama-models/blob/main/models/llama4/MODEL_CARD.md) is Meta's latest generation of open-source LLM model with industry-leading performance.
|
||||
|
||||
SGLang has supported Llama 4 Scout (109B) and Llama 4 Maverick (400B) since [v0.4.5](https://github.com/sgl-project/sglang/releases/tag/v0.4.5).
|
||||
|
||||
Ongoing optimizations are tracked in the [Roadmap](https://github.com/sgl-project/sglang/issues/5118).
|
||||
|
||||
## Launch Llama 4 with SGLang
|
||||
|
||||
To serve Llama 4 models on 8xH100/H200 GPUs:
|
||||
|
||||
```bash
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct --tp 8 --context-length 1000000
|
||||
```
|
||||
|
||||
### Configuration Tips
|
||||
|
||||
- **OOM Mitigation**: Adjust `--context-length` to avoid a GPU out-of-memory issue. For the Scout model, we recommend setting this value up to 1M on 8\*H100 and up to 2.5M on 8\*H200. For the Maverick model, we don't need to set context length on 8\*H200. When hybrid kv cache is enabled, `--context-length` can be set up to 5M on 8\*H100 and up to 10M on 8\*H200 for the Scout model.
|
||||
|
||||
- **Chat Template**: Add `--chat-template llama-4` for chat completion tasks.
|
||||
- **Enable Multi-Modal**: Add `--enable-multimodal` for multi-modal capabilities.
|
||||
- **Enable Hybrid-KVCache**: Add `--hybrid-kvcache-ratio` for hybrid kv cache. Details can be seen in [this PR](https://github.com/sgl-project/sglang/pull/6563)
|
||||
|
||||
|
||||
### EAGLE Speculative Decoding
|
||||
**Description**: SGLang has supported Llama 4 Maverick (400B) with [EAGLE speculative decoding](https://docs.sglang.ai/backend/speculative_decoding.html#EAGLE-Decoding).
|
||||
|
||||
**Usage**:
|
||||
Add arguments `--speculative-draft-model-path`, `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example:
|
||||
```
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct --speculative-algorithm EAGLE3 --speculative-draft-model-path nvidia/Llama-4-Maverick-17B-128E-Eagle3 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --trust-remote-code --tp 8 --context-length 1000000
|
||||
```
|
||||
|
||||
- **Note** The Llama 4 draft model *nvidia/Llama-4-Maverick-17B-128E-Eagle3* can only recognize conversations in chat mode.
|
||||
|
||||
## Benchmarking Results
|
||||
|
||||
### Accuracy Test with `lm_eval`
|
||||
|
||||
The accuracy on SGLang for both Llama4 Scout and Llama4 Maverick can match the [official benchmark numbers](https://ai.meta.com/blog/llama-4-multimodal-intelligence/).
|
||||
|
||||
Benchmark results on MMLU Pro dataset with 8*H100:
|
||||
| | Llama-4-Scout-17B-16E-Instruct | Llama-4-Maverick-17B-128E-Instruct |
|
||||
|--------------------|--------------------------------|-------------------------------------|
|
||||
| Official Benchmark | 74.3 | 80.5 |
|
||||
| SGLang | 75.2 | 80.7 |
|
||||
|
||||
Commands:
|
||||
|
||||
```bash
|
||||
# Llama-4-Scout-17B-16E-Instruct model
|
||||
python -m sglang.launch_server --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct --port 30000 --tp 8 --mem-fraction-static 0.8 --context-length 65536
|
||||
lm_eval --model local-chat-completions --model_args model=meta-llama/Llama-4-Scout-17B-16E-Instruct,base_url=http://localhost:30000/v1/chat/completions,num_concurrent=128,timeout=999999,max_gen_toks=2048 --tasks mmlu_pro --batch_size 128 --apply_chat_template --num_fewshot 0
|
||||
|
||||
# Llama-4-Maverick-17B-128E-Instruct
|
||||
python -m sglang.launch_server --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct --port 30000 --tp 8 --mem-fraction-static 0.8 --context-length 65536
|
||||
lm_eval --model local-chat-completions --model_args model=meta-llama/Llama-4-Maverick-17B-128E-Instruct,base_url=http://localhost:30000/v1/chat/completions,num_concurrent=128,timeout=999999,max_gen_toks=2048 --tasks mmlu_pro --batch_size 128 --apply_chat_template --num_fewshot 0
|
||||
```
|
||||
|
||||
Details can be seen in [this PR](https://github.com/sgl-project/sglang/pull/5092).
|
||||
497
docs/basic_usage/native_api.ipynb
Normal file
497
docs/basic_usage/native_api.ipynb
Normal file
@@ -0,0 +1,497 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# SGLang Native APIs\n",
|
||||
"\n",
|
||||
"Apart from the OpenAI compatible APIs, the SGLang Runtime also provides its native server APIs. We introduce the following APIs:\n",
|
||||
"\n",
|
||||
"- `/generate` (text generation model)\n",
|
||||
"- `/get_model_info`\n",
|
||||
"- `/get_server_info`\n",
|
||||
"- `/health`\n",
|
||||
"- `/health_generate`\n",
|
||||
"- `/flush_cache`\n",
|
||||
"- `/update_weights`\n",
|
||||
"- `/encode`(embedding model)\n",
|
||||
"- `/v1/rerank`(cross encoder rerank model)\n",
|
||||
"- `/classify`(reward model)\n",
|
||||
"- `/start_expert_distribution_record`\n",
|
||||
"- `/stop_expert_distribution_record`\n",
|
||||
"- `/dump_expert_distribution_record`\n",
|
||||
"- A full list of these APIs can be found at [http_server.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/http_server.py)\n",
|
||||
"\n",
|
||||
"We mainly use `requests` to test these APIs in the following examples. You can also use `curl`.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Launch A Server"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
|
||||
"\n",
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Generate (text generation model)\n",
|
||||
"Generate completions. This is similar to the `/v1/completions` in OpenAI API. Detailed parameters can be found in the [sampling parameters](sampling_params.md)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"url = f\"http://localhost:{port}/generate\"\n",
|
||||
"data = {\"text\": \"What is the capital of France?\"}\n",
|
||||
"\n",
|
||||
"response = requests.post(url, json=data)\n",
|
||||
"print_highlight(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Get Model Info\n",
|
||||
"\n",
|
||||
"Get the information of the model.\n",
|
||||
"\n",
|
||||
"- `model_path`: The path/name of the model.\n",
|
||||
"- `is_generation`: Whether the model is used as generation model or embedding model.\n",
|
||||
"- `tokenizer_path`: The path/name of the tokenizer.\n",
|
||||
"- `preferred_sampling_params`: The default sampling params specified via `--preferred-sampling-params`. `None` is returned in this example as we did not explicitly configure it in server args.\n",
|
||||
"- `weight_version`: This field contains the version of the model weights. This is often used to track changes or updates to the model’s trained parameters."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"url = f\"http://localhost:{port}/get_model_info\"\n",
|
||||
"\n",
|
||||
"response = requests.get(url)\n",
|
||||
"response_json = response.json()\n",
|
||||
"print_highlight(response_json)\n",
|
||||
"assert response_json[\"model_path\"] == \"qwen/qwen2.5-0.5b-instruct\"\n",
|
||||
"assert response_json[\"is_generation\"] is True\n",
|
||||
"assert response_json[\"tokenizer_path\"] == \"qwen/qwen2.5-0.5b-instruct\"\n",
|
||||
"assert response_json[\"preferred_sampling_params\"] is None\n",
|
||||
"assert response_json.keys() == {\n",
|
||||
" \"model_path\",\n",
|
||||
" \"is_generation\",\n",
|
||||
" \"tokenizer_path\",\n",
|
||||
" \"preferred_sampling_params\",\n",
|
||||
" \"weight_version\",\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Get Server Info\n",
|
||||
"Gets the server information including CLI arguments, token limits, and memory pool sizes.\n",
|
||||
"- Note: `get_server_info` merges the following deprecated endpoints:\n",
|
||||
" - `get_server_args`\n",
|
||||
" - `get_memory_pool_size` \n",
|
||||
" - `get_max_total_num_tokens`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"url = f\"http://localhost:{port}/get_server_info\"\n",
|
||||
"\n",
|
||||
"response = requests.get(url)\n",
|
||||
"print_highlight(response.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Health Check\n",
|
||||
"- `/health`: Check the health of the server.\n",
|
||||
"- `/health_generate`: Check the health of the server by generating one token."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"url = f\"http://localhost:{port}/health_generate\"\n",
|
||||
"\n",
|
||||
"response = requests.get(url)\n",
|
||||
"print_highlight(response.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"url = f\"http://localhost:{port}/health\"\n",
|
||||
"\n",
|
||||
"response = requests.get(url)\n",
|
||||
"print_highlight(response.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Flush Cache\n",
|
||||
"\n",
|
||||
"Flush the radix cache. It will be automatically triggered when the model weights are updated by the `/update_weights` API."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"url = f\"http://localhost:{port}/flush_cache\"\n",
|
||||
"\n",
|
||||
"response = requests.post(url)\n",
|
||||
"print_highlight(response.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Update Weights From Disk\n",
|
||||
"\n",
|
||||
"Update model weights from disk without restarting the server. Only applicable for models with the same architecture and parameter size.\n",
|
||||
"\n",
|
||||
"SGLang supports the `update_weights_from_disk` API for continuous evaluation during training (save a checkpoint to disk and update weights from disk).\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# successful update with same architecture and size\n",
|
||||
"\n",
|
||||
"url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
|
||||
"data = {\"model_path\": \"qwen/qwen2.5-0.5b-instruct\"}\n",
|
||||
"\n",
|
||||
"response = requests.post(url, json=data)\n",
|
||||
"print_highlight(response.text)\n",
|
||||
"assert response.json()[\"success\"] is True\n",
|
||||
"assert response.json()[\"message\"] == \"Succeeded to update model weights.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# failed update with different parameter size or wrong name\n",
|
||||
"\n",
|
||||
"url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
|
||||
"data = {\"model_path\": \"qwen/qwen2.5-0.5b-instruct-wrong\"}\n",
|
||||
"\n",
|
||||
"response = requests.post(url, json=data)\n",
|
||||
"response_json = response.json()\n",
|
||||
"print_highlight(response_json)\n",
|
||||
"assert response_json[\"success\"] is False\n",
|
||||
"assert response_json[\"message\"] == (\n",
|
||||
" \"Failed to get weights iterator: \"\n",
|
||||
" \"qwen/qwen2.5-0.5b-instruct-wrong\"\n",
|
||||
" \" (repository not found).\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Encode (embedding model)\n",
|
||||
"\n",
|
||||
"Encode text into embeddings. Note that this API is only available for [embedding models](openai_api_embeddings.ipynb) and will raise an error for generation models.\n",
|
||||
"Therefore, we launch a new server to serve an embedding model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embedding_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
"python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
|
||||
" --host 0.0.0.0 --is-embedding --log-level warning\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# successful encode for embedding model\n",
|
||||
"\n",
|
||||
"url = f\"http://localhost:{port}/encode\"\n",
|
||||
"data = {\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"text\": \"Once upon a time\"}\n",
|
||||
"\n",
|
||||
"response = requests.post(url, json=data)\n",
|
||||
"response_json = response.json()\n",
|
||||
"print_highlight(f\"Text embedding (first 10): {response_json['embedding'][:10]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(embedding_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## v1/rerank (cross encoder rerank model)\n",
|
||||
"Rerank a list of documents given a query using a cross-encoder model. Note that this API is only available for cross-encoder models like [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) with `attention-backend` `triton` and `torch_native`.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"reranker_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
"python3 -m sglang.launch_server --model-path BAAI/bge-reranker-v2-m3 \\\n",
|
||||
" --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding --log-level warning\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# compute rerank scores for query and documents\n",
|
||||
"\n",
|
||||
"url = f\"http://localhost:{port}/v1/rerank\"\n",
|
||||
"data = {\n",
|
||||
" \"model\": \"BAAI/bge-reranker-v2-m3\",\n",
|
||||
" \"query\": \"what is panda?\",\n",
|
||||
" \"documents\": [\n",
|
||||
" \"hi\",\n",
|
||||
" \"The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.\",\n",
|
||||
" ],\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"response = requests.post(url, json=data)\n",
|
||||
"response_json = response.json()\n",
|
||||
"for item in response_json:\n",
|
||||
" print_highlight(f\"Score: {item['score']:.2f} - Document: '{item['document']}'\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(reranker_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Classify (reward model)\n",
|
||||
"\n",
|
||||
"SGLang Runtime also supports reward models. Here we use a reward model to classify the quality of pairwise generations."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Note that SGLang now treats embedding models and reward models as the same type of models.\n",
|
||||
"# This will be updated in the future.\n",
|
||||
"\n",
|
||||
"reward_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
"python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding --log-level warning\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import AutoTokenizer\n",
|
||||
"\n",
|
||||
"PROMPT = (\n",
|
||||
" \"What is the range of the numeric output of a sigmoid node in a neural network?\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"RESPONSE1 = \"The output of a sigmoid node is bounded between -1 and 1.\"\n",
|
||||
"RESPONSE2 = \"The output of a sigmoid node is bounded between 0 and 1.\"\n",
|
||||
"\n",
|
||||
"CONVS = [\n",
|
||||
" [{\"role\": \"user\", \"content\": PROMPT}, {\"role\": \"assistant\", \"content\": RESPONSE1}],\n",
|
||||
" [{\"role\": \"user\", \"content\": PROMPT}, {\"role\": \"assistant\", \"content\": RESPONSE2}],\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(\"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\")\n",
|
||||
"prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n",
|
||||
"\n",
|
||||
"url = f\"http://localhost:{port}/classify\"\n",
|
||||
"data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n",
|
||||
"\n",
|
||||
"responses = requests.post(url, json=data).json()\n",
|
||||
"for response in responses:\n",
|
||||
" print_highlight(f\"reward: {response['embedding'][0]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(reward_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Capture expert selection distribution in MoE models\n",
|
||||
"\n",
|
||||
"SGLang Runtime supports recording the number of times an expert is selected in a MoE model run for each expert in the model. This is useful when analyzing the throughput of the model and planning for optimization.\n",
|
||||
"\n",
|
||||
"*Note: We only print out the first 10 lines of the csv below for better readability. Please adjust accordingly if you want to analyze the results more deeply.*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"expert_record_server_process, port = launch_server_cmd(\n",
|
||||
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat --log-level warning\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = requests.post(f\"http://localhost:{port}/start_expert_distribution_record\")\n",
|
||||
"print_highlight(response)\n",
|
||||
"\n",
|
||||
"url = f\"http://localhost:{port}/generate\"\n",
|
||||
"data = {\"text\": \"What is the capital of France?\"}\n",
|
||||
"\n",
|
||||
"response = requests.post(url, json=data)\n",
|
||||
"print_highlight(response.json())\n",
|
||||
"\n",
|
||||
"response = requests.post(f\"http://localhost:{port}/stop_expert_distribution_record\")\n",
|
||||
"print_highlight(response)\n",
|
||||
"\n",
|
||||
"response = requests.post(f\"http://localhost:{port}/dump_expert_distribution_record\")\n",
|
||||
"print_highlight(response)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(expert_record_server_process)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
235
docs/basic_usage/offline_engine_api.ipynb
Normal file
235
docs/basic_usage/offline_engine_api.ipynb
Normal file
@@ -0,0 +1,235 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Offline Engine API\n",
|
||||
"\n",
|
||||
"SGLang provides a direct inference engine without the need for an HTTP server, especially for use cases where an additional HTTP server adds unnecessary complexity or overhead. Here are two general use cases:\n",
|
||||
"\n",
|
||||
"- Offline Batch Inference\n",
|
||||
"- Custom Server on Top of the Engine\n",
|
||||
"\n",
|
||||
"This document focuses on the offline batch inference, demonstrating four different inference modes:\n",
|
||||
"\n",
|
||||
"- Non-streaming synchronous generation\n",
|
||||
"- Streaming synchronous generation\n",
|
||||
"- Non-streaming asynchronous generation\n",
|
||||
"- Streaming asynchronous generation\n",
|
||||
"\n",
|
||||
"Additionally, you can easily build a custom server on top of the SGLang offline engine. A detailed example working in a python script can be found in [custom_server](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/custom_server.py).\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Nest Asyncio\n",
|
||||
"Note that if you want to use **Offline Engine** in ipython or some other nested loop code, you need to add the following code:\n",
|
||||
"```python\n",
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()\n",
|
||||
"\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Advanced Usage\n",
|
||||
"\n",
|
||||
"The engine supports [vlm inference](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/offline_batch_inference_vlm.py) as well as [extracting hidden states](https://github.com/sgl-project/sglang/blob/main/examples/runtime/hidden_states). \n",
|
||||
"\n",
|
||||
"Please see [the examples](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine) for further use cases."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Offline Batch Inference\n",
|
||||
"\n",
|
||||
"SGLang offline engine supports batch inference with efficient scheduling."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# launch the offline engine\n",
|
||||
"import asyncio\n",
|
||||
"\n",
|
||||
"import sglang as sgl\n",
|
||||
"import sglang.test.doc_patch\n",
|
||||
"from sglang.utils import async_stream_and_merge, stream_and_merge\n",
|
||||
"\n",
|
||||
"llm = sgl.Engine(model_path=\"qwen/qwen2.5-0.5b-instruct\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Non-streaming Synchronous Generation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Hello, my name is\",\n",
|
||||
" \"The president of the United States is\",\n",
|
||||
" \"The capital of France is\",\n",
|
||||
" \"The future of AI is\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
|
||||
"\n",
|
||||
"outputs = llm.generate(prompts, sampling_params)\n",
|
||||
"for prompt, output in zip(prompts, outputs):\n",
|
||||
" print(\"===============================\")\n",
|
||||
" print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Streaming Synchronous Generation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
|
||||
" \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
|
||||
" \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"sampling_params = {\n",
|
||||
" \"temperature\": 0.2,\n",
|
||||
" \"top_p\": 0.9,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Testing synchronous streaming generation with overlap removal ===\\n\")\n",
|
||||
"\n",
|
||||
"for prompt in prompts:\n",
|
||||
" print(f\"Prompt: {prompt}\")\n",
|
||||
" merged_output = stream_and_merge(llm, prompt, sampling_params)\n",
|
||||
" print(\"Generated text:\", merged_output)\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Non-streaming Asynchronous Generation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
|
||||
" \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
|
||||
" \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Testing asynchronous batch generation ===\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"async def main():\n",
|
||||
" outputs = await llm.async_generate(prompts, sampling_params)\n",
|
||||
"\n",
|
||||
" for prompt, output in zip(prompts, outputs):\n",
|
||||
" print(f\"\\nPrompt: {prompt}\")\n",
|
||||
" print(f\"Generated text: {output['text']}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"asyncio.run(main())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Streaming Asynchronous Generation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
|
||||
" \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
|
||||
" \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Testing asynchronous streaming generation (no repeats) ===\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"async def main():\n",
|
||||
" for prompt in prompts:\n",
|
||||
" print(f\"\\nPrompt: {prompt}\")\n",
|
||||
" print(\"Generated text: \", end=\"\", flush=True)\n",
|
||||
"\n",
|
||||
" # Replace direct calls to async_generate with our custom overlap-aware version\n",
|
||||
" async for cleaned_chunk in async_stream_and_merge(llm, prompt, sampling_params):\n",
|
||||
" print(cleaned_chunk, end=\"\", flush=True)\n",
|
||||
"\n",
|
||||
" print() # New line after each prompt\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"asyncio.run(main())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm.shutdown()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
9
docs/basic_usage/openai_api.rst
Normal file
9
docs/basic_usage/openai_api.rst
Normal file
@@ -0,0 +1,9 @@
|
||||
OpenAI-Compatible APIs
|
||||
======================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
openai_api_completions.ipynb
|
||||
openai_api_vision.ipynb
|
||||
openai_api_embeddings.ipynb
|
||||
389
docs/basic_usage/openai_api_completions.ipynb
Normal file
389
docs/basic_usage/openai_api_completions.ipynb
Normal file
@@ -0,0 +1,389 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# OpenAI APIs - Completions\n",
|
||||
"\n",
|
||||
"SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
|
||||
"A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n",
|
||||
"\n",
|
||||
"This tutorial covers the following popular APIs:\n",
|
||||
"\n",
|
||||
"- `chat/completions`\n",
|
||||
"- `completions`\n",
|
||||
"\n",
|
||||
"Check out other tutorials to learn about [vision APIs](openai_api_vision.ipynb) for vision-language models and [embedding APIs](openai_api_embeddings.ipynb) for embedding models."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Launch A Server\n",
|
||||
"\n",
|
||||
"Launch the server in your terminal and wait for it to initialize."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
|
||||
"\n",
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")\n",
|
||||
"print(f\"Server started on http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Chat Completions\n",
|
||||
"\n",
|
||||
"### Usage\n",
|
||||
"\n",
|
||||
"The server fully implements the OpenAI API.\n",
|
||||
"It will automatically apply the chat template specified in the Hugging Face tokenizer, if one is available.\n",
|
||||
"You can also specify a custom chat template with `--chat-template` when launching the server."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import openai\n",
|
||||
"\n",
|
||||
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=64,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(f\"Response: {response}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Model Thinking/Reasoning Support\n",
|
||||
"\n",
|
||||
"Some models support internal reasoning or thinking processes that can be exposed in the API response. SGLang provides unified support for various reasoning models through the `chat_template_kwargs` parameter and compatible reasoning parsers.\n",
|
||||
"\n",
|
||||
"#### Supported Models and Configuration\n",
|
||||
"\n",
|
||||
"| Model Family | Chat Template Parameter | Reasoning Parser | Notes |\n",
|
||||
"|--------------|------------------------|------------------|--------|\n",
|
||||
"| DeepSeek-R1 (R1, R1-0528, R1-Distill) | `enable_thinking` | `--reasoning-parser deepseek-r1` | Standard reasoning models |\n",
|
||||
"| DeepSeek-V3.1 | `thinking` | `--reasoning-parser deepseek-v3` | Hybrid model (thinking/non-thinking modes) |\n",
|
||||
"| Qwen3 (standard) | `enable_thinking` | `--reasoning-parser qwen3` | Hybrid model (thinking/non-thinking modes) |\n",
|
||||
"| Qwen3-Thinking | N/A (always enabled) | `--reasoning-parser qwen3-thinking` | Always generates reasoning |\n",
|
||||
"| Kimi | N/A (always enabled) | `--reasoning-parser kimi` | Kimi thinking models |\n",
|
||||
"| Gpt-Oss | N/A (always enabled) | `--reasoning-parser gpt-oss` | Gpt-Oss thinking models |\n",
|
||||
"\n",
|
||||
"#### Basic Usage\n",
|
||||
"\n",
|
||||
"To enable reasoning output, you need to:\n",
|
||||
"1. Launch the server with the appropriate reasoning parser\n",
|
||||
"2. Set the model-specific parameter in `chat_template_kwargs`\n",
|
||||
"3. Optionally use `separate_reasoning: False` to not get reasoning content separately (default to `True`)\n",
|
||||
"\n",
|
||||
"**Note for Qwen3-Thinking models:** These models always generate thinking content and do not support the `enable_thinking` parameter. Use `--reasoning-parser qwen3-thinking` or `--reasoning-parser qwen3` to parse the thinking content.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Example: Qwen3 Models\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"# Launch server:\n",
|
||||
"# python3 -m sglang.launch_server --model Qwen/Qwen3-4B --reasoning-parser qwen3\n",
|
||||
"\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"client = OpenAI(\n",
|
||||
" api_key=\"EMPTY\",\n",
|
||||
" base_url=f\"http://127.0.0.1:30000/v1\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"model = \"Qwen/Qwen3-4B\"\n",
|
||||
"messages = [{\"role\": \"user\", \"content\": \"How many r's are in 'strawberry'?\"}]\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=model,\n",
|
||||
" messages=messages,\n",
|
||||
" extra_body={\n",
|
||||
" \"chat_template_kwargs\": {\"enable_thinking\": True},\n",
|
||||
" \"separate_reasoning\": True\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"Reasoning:\", response.choices[0].message.reasoning_content)\n",
|
||||
"print(\"-\"*100)\n",
|
||||
"print(\"Answer:\", response.choices[0].message.content)\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"**Example Output:**\n",
|
||||
"```\n",
|
||||
"Reasoning: Okay, so the user is asking how many 'r's are in the word 'strawberry'. Let me think. First, I need to make sure I have the word spelled correctly. Strawberry... S-T-R-A-W-B-E-R-R-Y. Wait, is that right? Let me break it down.\n",
|
||||
"\n",
|
||||
"Starting with 'strawberry', let's write out the letters one by one. S, T, R, A, W, B, E, R, R, Y. Hmm, wait, that's 10 letters. Let me check again. S (1), T (2), R (3), A (4), W (5), B (6), E (7), R (8), R (9), Y (10). So the letters are S-T-R-A-W-B-E-R-R-Y. \n",
|
||||
"...\n",
|
||||
"Therefore, the answer should be three R's in 'strawberry'. But I need to make sure I'm not counting any other letters as R. Let me check again. S, T, R, A, W, B, E, R, R, Y. No other R's. So three in total. Yeah, that seems right.\n",
|
||||
"\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Answer: The word \"strawberry\" contains **three** letters 'r'. Here's the breakdown:\n",
|
||||
"\n",
|
||||
"1. **S-T-R-A-W-B-E-R-R-Y** \n",
|
||||
" - The **third letter** is 'R'. \n",
|
||||
" - The **eighth and ninth letters** are also 'R's. \n",
|
||||
"\n",
|
||||
"Thus, the total count is **3**. \n",
|
||||
"\n",
|
||||
"**Answer:** 3.\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"**Note:** Setting `\"enable_thinking\": False` (or omitting it) will result in `reasoning_content` being `None`. Qwen3-Thinking models always generate reasoning content and don't support the `enable_thinking` parameter.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Example: DeepSeek-V3 Models\n",
|
||||
"\n",
|
||||
"DeepSeek-V3 models support thinking mode through the `thinking` parameter:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"# Launch server:\n",
|
||||
"# python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.1 --tp 8 --reasoning-parser deepseek-v3\n",
|
||||
"\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"client = OpenAI(\n",
|
||||
" api_key=\"EMPTY\",\n",
|
||||
" base_url=f\"http://127.0.0.1:30000/v1\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"model = \"deepseek-ai/DeepSeek-V3.1\"\n",
|
||||
"messages = [{\"role\": \"user\", \"content\": \"How many r's are in 'strawberry'?\"}]\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=model,\n",
|
||||
" messages=messages,\n",
|
||||
" extra_body={\n",
|
||||
" \"chat_template_kwargs\": {\"thinking\": True},\n",
|
||||
" \"separate_reasoning\": True\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"Reasoning:\", response.choices[0].message.reasoning_content)\n",
|
||||
"print(\"-\"*100)\n",
|
||||
"print(\"Answer:\", response.choices[0].message.content)\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"**Example Output:**\n",
|
||||
"```\n",
|
||||
"Reasoning: First, the question is: \"How many r's are in 'strawberry'?\"\n",
|
||||
"\n",
|
||||
"I need to count the number of times the letter 'r' appears in the word \"strawberry\".\n",
|
||||
"\n",
|
||||
"Let me write out the word: S-T-R-A-W-B-E-R-R-Y.\n",
|
||||
"\n",
|
||||
"Now, I'll go through each letter and count the 'r's.\n",
|
||||
"...\n",
|
||||
"So, I have three 'r's in \"strawberry\".\n",
|
||||
"\n",
|
||||
"I should double-check. The word is spelled S-T-R-A-W-B-E-R-R-Y. The letters are at positions: 3, 8, and 9 are 'r's. Yes, that's correct.\n",
|
||||
"\n",
|
||||
"Therefore, the answer should be 3.\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Answer: The word \"strawberry\" contains **3** instances of the letter \"r\". Here's a breakdown for clarity:\n",
|
||||
"\n",
|
||||
"- The word is spelled: S-T-R-A-W-B-E-R-R-Y\n",
|
||||
"- The \"r\" appears at the 3rd, 8th, and 9th positions.\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"**Note:** DeepSeek-V3 models use the `thinking` parameter (not `enable_thinking`) to control reasoning output.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Parameters\n",
|
||||
"\n",
|
||||
"The chat completions API accepts OpenAI Chat Completions API's parameters. Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n",
|
||||
"\n",
|
||||
"SGLang extends the standard API with the `extra_body` parameter, allowing for additional customization. One key option within `extra_body` is `chat_template_kwargs`, which can be used to pass arguments to the chat template processor."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" \"role\": \"system\",\n",
|
||||
" \"content\": \"You are a knowledgeable historian who provides concise responses.\",\n",
|
||||
" },\n",
|
||||
" {\"role\": \"user\", \"content\": \"Tell me about ancient Rome\"},\n",
|
||||
" {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"Ancient Rome was a civilization centered in Italy.\",\n",
|
||||
" },\n",
|
||||
" {\"role\": \"user\", \"content\": \"What were their major achievements?\"},\n",
|
||||
" ],\n",
|
||||
" temperature=0.3, # Lower temperature for more focused responses\n",
|
||||
" max_tokens=128, # Reasonable length for a concise response\n",
|
||||
" top_p=0.95, # Slightly higher for better fluency\n",
|
||||
" presence_penalty=0.2, # Mild penalty to avoid repetition\n",
|
||||
" frequency_penalty=0.2, # Mild penalty for more natural language\n",
|
||||
" n=1, # Single response is usually more stable\n",
|
||||
" seed=42, # Keep for reproducibility\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Streaming mode is also supported."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"stream = client.chat.completions.create(\n",
|
||||
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
|
||||
" messages=[{\"role\": \"user\", \"content\": \"Say this is a test\"}],\n",
|
||||
" stream=True,\n",
|
||||
")\n",
|
||||
"for chunk in stream:\n",
|
||||
" if chunk.choices[0].delta.content is not None:\n",
|
||||
" print(chunk.choices[0].delta.content, end=\"\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Completions\n",
|
||||
"\n",
|
||||
"### Usage\n",
|
||||
"Completions API is similar to Chat Completions API, but without the `messages` parameter or chat templates."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = client.completions.create(\n",
|
||||
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
|
||||
" prompt=\"List 3 countries and their capitals.\",\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=64,\n",
|
||||
" n=1,\n",
|
||||
" stop=None,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(f\"Response: {response}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Parameters\n",
|
||||
"\n",
|
||||
"The completions API accepts OpenAI Completions API's parameters. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n",
|
||||
"\n",
|
||||
"Here is an example of a detailed completions request:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = client.completions.create(\n",
|
||||
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
|
||||
" prompt=\"Write a short story about a space explorer.\",\n",
|
||||
" temperature=0.7, # Moderate temperature for creative writing\n",
|
||||
" max_tokens=150, # Longer response for a story\n",
|
||||
" top_p=0.9, # Balanced diversity in word choice\n",
|
||||
" stop=[\"\\n\\n\", \"THE END\"], # Multiple stop sequences\n",
|
||||
" presence_penalty=0.3, # Encourage novel elements\n",
|
||||
" frequency_penalty=0.3, # Reduce repetitive phrases\n",
|
||||
" n=1, # Generate one completion\n",
|
||||
" seed=123, # For reproducible results\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(f\"Response: {response}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Structured Outputs (JSON, Regex, EBNF)\n",
|
||||
"\n",
|
||||
"For OpenAI compatible structured outputs API, refer to [Structured Outputs](../advanced_features/structured_outputs.ipynb) for more details.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
195
docs/basic_usage/openai_api_embeddings.ipynb
Normal file
195
docs/basic_usage/openai_api_embeddings.ipynb
Normal file
@@ -0,0 +1,195 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# OpenAI APIs - Embedding\n",
|
||||
"\n",
|
||||
"SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
|
||||
"A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/guides/embeddings).\n",
|
||||
"\n",
|
||||
"This tutorial covers the embedding APIs for embedding models. For a list of the supported models see the [corresponding overview page](../supported_models/embedding_models.md)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Launch A Server\n",
|
||||
"\n",
|
||||
"Launch the server in your terminal and wait for it to initialize. Remember to add `--is-embedding` to the command."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
|
||||
"\n",
|
||||
"embedding_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
"python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
|
||||
" --host 0.0.0.0 --is-embedding --log-level warning\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using cURL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import subprocess, json\n",
|
||||
"\n",
|
||||
"text = \"Once upon a time\"\n",
|
||||
"\n",
|
||||
"curl_text = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
|
||||
" -H \"Content-Type: application/json\" \\\n",
|
||||
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n",
|
||||
"\n",
|
||||
"result = subprocess.check_output(curl_text, shell=True)\n",
|
||||
"\n",
|
||||
"print(result)\n",
|
||||
"\n",
|
||||
"text_embedding = json.loads(result)[\"data\"][0][\"embedding\"]\n",
|
||||
"\n",
|
||||
"print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Python Requests"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"text = \"Once upon a time\"\n",
|
||||
"\n",
|
||||
"response = requests.post(\n",
|
||||
" f\"http://localhost:{port}/v1/embeddings\",\n",
|
||||
" json={\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": text},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"text_embedding = response.json()[\"data\"][0][\"embedding\"]\n",
|
||||
"\n",
|
||||
"print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using OpenAI Python Client"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import openai\n",
|
||||
"\n",
|
||||
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"# Text embedding example\n",
|
||||
"response = client.embeddings.create(\n",
|
||||
" model=\"Alibaba-NLP/gte-Qwen2-1.5B-instruct\",\n",
|
||||
" input=text,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"embedding = response.data[0].embedding[:10]\n",
|
||||
"print_highlight(f\"Text embedding (first 10): {embedding}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Input IDs\n",
|
||||
"\n",
|
||||
"SGLang also supports `input_ids` as input to get the embedding."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"from transformers import AutoTokenizer\n",
|
||||
"\n",
|
||||
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
|
||||
"\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-1.5B-instruct\")\n",
|
||||
"input_ids = tokenizer.encode(text)\n",
|
||||
"\n",
|
||||
"curl_ids = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
|
||||
" -H \"Content-Type: application/json\" \\\n",
|
||||
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n",
|
||||
"\n",
|
||||
"input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n",
|
||||
" 0\n",
|
||||
"][\"embedding\"]\n",
|
||||
"\n",
|
||||
"print_highlight(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(embedding_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Multi-Modal Embedding Model\n",
|
||||
"Please refer to [Multi-Modal Embedding Model](../supported_models/embedding_models.md)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
254
docs/basic_usage/openai_api_vision.ipynb
Normal file
254
docs/basic_usage/openai_api_vision.ipynb
Normal file
@@ -0,0 +1,254 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# OpenAI APIs - Vision\n",
|
||||
"\n",
|
||||
"SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
|
||||
"A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/guides/vision).\n",
|
||||
"This tutorial covers the vision APIs for vision language models.\n",
|
||||
"\n",
|
||||
"SGLang supports various vision language models such as Llama 3.2, LLaVA-OneVision, Qwen2.5-VL, Gemma3 and [more](../supported_models/multimodal_language_models.md).\n",
|
||||
"\n",
|
||||
"As an alternative to the OpenAI API, you can also use the [SGLang offline engine](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/offline_batch_inference_vlm.py)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Launch A Server\n",
|
||||
"\n",
|
||||
"Launch the server in your terminal and wait for it to initialize."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
|
||||
"\n",
|
||||
"vision_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --log-level warning\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using cURL\n",
|
||||
"\n",
|
||||
"Once the server is up, you can send test requests using curl or requests."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import subprocess\n",
|
||||
"\n",
|
||||
"curl_command = f\"\"\"\n",
|
||||
"curl -s http://localhost:{port}/v1/chat/completions \\\\\n",
|
||||
" -H \"Content-Type: application/json\" \\\\\n",
|
||||
" -d '{{\n",
|
||||
" \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
|
||||
" \"messages\": [\n",
|
||||
" {{\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": [\n",
|
||||
" {{\n",
|
||||
" \"type\": \"text\",\n",
|
||||
" \"text\": \"What’s in this image?\"\n",
|
||||
" }},\n",
|
||||
" {{\n",
|
||||
" \"type\": \"image_url\",\n",
|
||||
" \"image_url\": {{\n",
|
||||
" \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
|
||||
" }}\n",
|
||||
" }}\n",
|
||||
" ]\n",
|
||||
" }}\n",
|
||||
" ],\n",
|
||||
" \"max_tokens\": 300\n",
|
||||
" }}'\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"response = subprocess.check_output(curl_command, shell=True).decode()\n",
|
||||
"print_highlight(response)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"response = subprocess.check_output(curl_command, shell=True).decode()\n",
|
||||
"print_highlight(response)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Python Requests"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"url = f\"http://localhost:{port}/v1/chat/completions\"\n",
|
||||
"\n",
|
||||
"data = {\n",
|
||||
" \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
|
||||
" \"messages\": [\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": [\n",
|
||||
" {\"type\": \"text\", \"text\": \"What’s in this image?\"},\n",
|
||||
" {\n",
|
||||
" \"type\": \"image_url\",\n",
|
||||
" \"image_url\": {\n",
|
||||
" \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"max_tokens\": 300,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"response = requests.post(url, json=data)\n",
|
||||
"print_highlight(response.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using OpenAI Python Client"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": [\n",
|
||||
" {\n",
|
||||
" \"type\": \"text\",\n",
|
||||
" \"text\": \"What is in this image?\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"type\": \"image_url\",\n",
|
||||
" \"image_url\": {\n",
|
||||
" \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" max_tokens=300,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Multiple-Image Inputs\n",
|
||||
"\n",
|
||||
"The server also supports multiple images and interleaved text and images if the model supports it."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": [\n",
|
||||
" {\n",
|
||||
" \"type\": \"image_url\",\n",
|
||||
" \"image_url\": {\n",
|
||||
" \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\",\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"type\": \"image_url\",\n",
|
||||
" \"image_url\": {\n",
|
||||
" \"url\": \"https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png\",\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"type\": \"text\",\n",
|
||||
" \"text\": \"I have two very different images. They are not related at all. \"\n",
|
||||
" \"Please describe the first image in one sentence, and then describe the second image in another sentence.\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(vision_process)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
27
docs/basic_usage/qwen3.md
Normal file
27
docs/basic_usage/qwen3.md
Normal file
@@ -0,0 +1,27 @@
|
||||
# Qwen3-Next Usage
|
||||
|
||||
SGLang has supported Qwen3-Next-80B-A3B-Instruct and Qwen3-Next-80B-A3B-Thinking since [this PR](https://github.com/sgl-project/sglang/pull/10233).
|
||||
|
||||
## Launch Qwen3-Next with SGLang
|
||||
|
||||
To serve Qwen3-Next models on 4xH100/H200 GPUs:
|
||||
|
||||
```bash
|
||||
python3 -m sglang.launch_server --model Qwen/Qwen3-Next-80B-A3B-Instruct --tp 4
|
||||
```
|
||||
|
||||
### Configuration Tips
|
||||
- `--max-mamba-cache-size`: Adjust `--max-mamba-cache-size` to increase mamba cache space and max running requests capability. It will decrease KV cache space as a trade-off. You can adjust it according to workload.
|
||||
- `--mamba-ssm-dtype`: `bfloat16` or `float32`, use `bfloat16` to save mamba cache size and `float32` to get more accurate results. The default setting is `float32`.
|
||||
|
||||
### EAGLE Speculative Decoding
|
||||
**Description**: SGLang has supported Qwen3-Next models with [EAGLE speculative decoding](https://docs.sglang.ai/advanced_features/speculative_decoding.html#EAGLE-Decoding).
|
||||
|
||||
**Usage**:
|
||||
Add arguments `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example:
|
||||
|
||||
``` bash
|
||||
python3 -m sglang.launch_server --model Qwen/Qwen3-Next-80B-A3B-Instruct --tp 4 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --speculative-algo NEXTN
|
||||
```
|
||||
|
||||
Details can be seen in [this PR](https://github.com/sgl-project/sglang/pull/10233).
|
||||
305
docs/basic_usage/sampling_params.md
Normal file
305
docs/basic_usage/sampling_params.md
Normal file
@@ -0,0 +1,305 @@
|
||||
# Sampling Parameters
|
||||
|
||||
This doc describes the sampling parameters of the SGLang Runtime. It is the low-level endpoint of the runtime.
|
||||
If you want a high-level endpoint that can automatically handle chat templates, consider using the [OpenAI Compatible API](openai_api_completions.ipynb).
|
||||
|
||||
## `/generate` Endpoint
|
||||
|
||||
The `/generate` endpoint accepts the following parameters in JSON format. For detailed usage, see the [native API doc](native_api.ipynb). The object is defined at `io_struct.py::GenerateReqInput`. You can also read the source code to find more arguments and docs.
|
||||
|
||||
| Argument | Type/Default | Description |
|
||||
|----------------------------|------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| text | `Optional[Union[List[str], str]] = None` | The input prompt. Can be a single prompt or a batch of prompts. |
|
||||
| input_ids | `Optional[Union[List[List[int]], List[int]]] = None` | The token IDs for text; one can specify either text or input_ids. |
|
||||
| input_embeds | `Optional[Union[List[List[List[float]]], List[List[float]]]] = None` | The embeddings for input_ids; one can specify either text, input_ids, or input_embeds. |
|
||||
| image_data | `Optional[Union[List[List[ImageDataItem]], List[ImageDataItem], ImageDataItem]] = None` | The image input. Can be an image instance, file name, URL, or base64 encoded string. Can be a single image, list of images, or list of lists of images. |
|
||||
| audio_data | `Optional[Union[List[AudioDataItem], AudioDataItem]] = None` | The audio input. Can be a file name, URL, or base64 encoded string. |
|
||||
| sampling_params | `Optional[Union[List[Dict], Dict]] = None` | The sampling parameters as described in the sections below. |
|
||||
| rid | `Optional[Union[List[str], str]] = None` | The request ID. |
|
||||
| return_logprob | `Optional[Union[List[bool], bool]] = None` | Whether to return log probabilities for tokens. |
|
||||
| logprob_start_len | `Optional[Union[List[int], int]] = None` | If return_logprob, the start location in the prompt for returning logprobs. Default is "-1", which returns logprobs for output tokens only. |
|
||||
| top_logprobs_num | `Optional[Union[List[int], int]] = None` | If return_logprob, the number of top logprobs to return at each position. |
|
||||
| token_ids_logprob | `Optional[Union[List[List[int]], List[int]]] = None` | If return_logprob, the token IDs to return logprob for. |
|
||||
| return_text_in_logprobs | `bool = False` | Whether to detokenize tokens in text in the returned logprobs. |
|
||||
| stream | `bool = False` | Whether to stream output. |
|
||||
| lora_path | `Optional[Union[List[Optional[str]], Optional[str]]] = None` | The path to the LoRA. |
|
||||
| custom_logit_processor | `Optional[Union[List[Optional[str]], str]] = None` | Custom logit processor for advanced sampling control. Must be a serialized instance of `CustomLogitProcessor` using its `to_str()` method. For usage see below. |
|
||||
| return_hidden_states | `Union[List[bool], bool] = False` | Whether to return hidden states. |
|
||||
|
||||
## Sampling parameters
|
||||
|
||||
The object is defined at `sampling_params.py::SamplingParams`. You can also read the source code to find more arguments and docs.
|
||||
|
||||
### Core parameters
|
||||
|
||||
| Argument | Type/Default | Description |
|
||||
|-----------------|----------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| max_new_tokens | `int = 128` | The maximum output length measured in tokens. |
|
||||
| stop | `Optional[Union[str, List[str]]] = None` | One or multiple [stop words](https://platform.openai.com/docs/api-reference/chat/create#chat-create-stop). Generation will stop if one of these words is sampled. |
|
||||
| stop_token_ids | `Optional[List[int]] = None` | Provide stop words in the form of token IDs. Generation will stop if one of these token IDs is sampled. |
|
||||
| temperature | `float = 1.0` | [Temperature](https://platform.openai.com/docs/api-reference/chat/create#chat-create-temperature) when sampling the next token. `temperature = 0` corresponds to greedy sampling, a higher temperature leads to more diversity. |
|
||||
| top_p | `float = 1.0` | [Top-p](https://platform.openai.com/docs/api-reference/chat/create#chat-create-top_p) selects tokens from the smallest sorted set whose cumulative probability exceeds `top_p`. When `top_p = 1`, this reduces to unrestricted sampling from all tokens. |
|
||||
| top_k | `int = -1` | [Top-k](https://developer.nvidia.com/blog/how-to-get-better-outputs-from-your-large-language-model/#predictability_vs_creativity) randomly selects from the `k` highest-probability tokens. |
|
||||
| min_p | `float = 0.0` | [Min-p](https://github.com/huggingface/transformers/issues/27670) samples from tokens with probability larger than `min_p * highest_token_probability`. |
|
||||
|
||||
### Penalizers
|
||||
|
||||
| Argument | Type/Default | Description |
|
||||
|--------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| frequency_penalty  | `float = 0.0`          | Penalizes tokens based on their frequency in generation so far. Must be between `-2` and `2` where negative numbers encourage repetition of tokens and positive numbers encourage sampling of new tokens. The scaling of penalization grows linearly with each appearance of a token. |
|
||||
| presence_penalty   | `float = 0.0`          | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repetition of tokens and positive numbers encourage sampling of new tokens. The scaling of the penalization is constant if a token occurred. |
|
||||
| min_new_tokens | `int = 0` | Forces the model to generate at least `min_new_tokens` until a stop word or EOS token is sampled. Note that this might lead to unintended behavior, for example, if the distribution is highly skewed towards these tokens. |
|
||||
|
||||
### Constrained decoding
|
||||
|
||||
Please refer to our dedicated guide on [constrained decoding](../advanced_features/structured_outputs.ipynb) for the following parameters.
|
||||
|
||||
| Argument | Type/Default | Description |
|
||||
|-----------------|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| json_schema | `Optional[str] = None` | JSON schema for structured outputs. |
|
||||
| regex | `Optional[str] = None` | Regex for structured outputs. |
|
||||
| ebnf | `Optional[str] = None` | EBNF for structured outputs. |
|
||||
| structural_tag | `Optional[str] = None` | The structal tag for structured outputs. |
|
||||
|
||||
### Other options
|
||||
|
||||
| Argument | Type/Default | Description |
|
||||
|-------------------------------|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| n | `int = 1` | Specifies the number of output sequences to generate per request. (Generating multiple outputs in one request (n > 1) is discouraged; repeating the same prompts several times offers better control and efficiency.) |
|
||||
| ignore_eos | `bool = False` | Don't stop generation when EOS token is sampled. |
|
||||
| skip_special_tokens | `bool = True` | Remove special tokens during decoding. |
|
||||
| spaces_between_special_tokens | `bool = True` | Whether or not to add spaces between special tokens during detokenization. |
|
||||
| no_stop_trim | `bool = False` | Don't trim stop words or EOS token from the generated text. |
|
||||
| custom_params | `Optional[List[Optional[Dict[str, Any]]]] = None` | Used when employing `CustomLogitProcessor`. For usage, see below. |
|
||||
|
||||
## Examples
|
||||
|
||||
### Normal
|
||||
|
||||
Launch a server:
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
|
||||
```
|
||||
|
||||
Send a request:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:30000/generate",
|
||||
json={
|
||||
"text": "The capital of France is",
|
||||
"sampling_params": {
|
||||
"temperature": 0,
|
||||
"max_new_tokens": 32,
|
||||
},
|
||||
},
|
||||
)
|
||||
print(response.json())
|
||||
```
|
||||
|
||||
Detailed example in [send request](./send_request.ipynb).
|
||||
|
||||
### Streaming
|
||||
|
||||
Send a request and stream the output:
|
||||
|
||||
```python
|
||||
import requests, json
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:30000/generate",
|
||||
json={
|
||||
"text": "The capital of France is",
|
||||
"sampling_params": {
|
||||
"temperature": 0,
|
||||
"max_new_tokens": 32,
|
||||
},
|
||||
"stream": True,
|
||||
},
|
||||
stream=True,
|
||||
)
|
||||
|
||||
prev = 0
|
||||
for chunk in response.iter_lines(decode_unicode=False):
|
||||
chunk = chunk.decode("utf-8")
|
||||
if chunk and chunk.startswith("data:"):
|
||||
if chunk == "data: [DONE]":
|
||||
break
|
||||
data = json.loads(chunk[5:].strip("\n"))
|
||||
output = data["text"].strip()
|
||||
print(output[prev:], end="", flush=True)
|
||||
prev = len(output)
|
||||
print("")
|
||||
```
|
||||
|
||||
Detailed example in [openai compatible api](openai_api_completions.ipynb).
|
||||
|
||||
### Multimodal
|
||||
|
||||
Launch a server:
|
||||
|
||||
```bash
|
||||
python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov
|
||||
```
|
||||
|
||||
Download an image:
|
||||
|
||||
```bash
|
||||
curl -o example_image.png -L https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true
|
||||
```
|
||||
|
||||
Send a request:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:30000/generate",
|
||||
json={
|
||||
"text": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
||||
"<|im_start|>user\n<image>\nDescribe this image in a very short sentence.<|im_end|>\n"
|
||||
"<|im_start|>assistant\n",
|
||||
"image_data": "example_image.png",
|
||||
"sampling_params": {
|
||||
"temperature": 0,
|
||||
"max_new_tokens": 32,
|
||||
},
|
||||
},
|
||||
)
|
||||
print(response.json())
|
||||
```
|
||||
|
||||
The `image_data` can be a file name, a URL, or a base64 encoded string. See also `python/sglang/srt/utils.py:load_image`.
|
||||
|
||||
Streaming is supported in a similar manner as [above](#streaming).
|
||||
|
||||
Detailed example in [OpenAI API Vision](openai_api_vision.ipynb).
|
||||
|
||||
### Structured Outputs (JSON, Regex, EBNF)
|
||||
|
||||
You can specify a JSON schema, regular expression or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request.
|
||||
|
||||
SGLang supports two grammar backends:
|
||||
|
||||
- [XGrammar](https://github.com/mlc-ai/xgrammar) (default): Supports JSON schema, regular expression, and EBNF constraints.
|
||||
- XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md).
|
||||
- [Outlines](https://github.com/dottxt-ai/outlines): Supports JSON schema and regular expression constraints.
|
||||
|
||||
If instead you want to initialize the Outlines backend, you can use `--grammar-backend outlines` flag:
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||
--port 30000 --host 0.0.0.0 --grammar-backend [xgrammar|outlines] # xgrammar or outlines (default: xgrammar)
|
||||
```
|
||||
|
||||
```python
|
||||
import json
|
||||
import requests
|
||||
|
||||
json_schema = json.dumps({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string", "pattern": "^[\\w]+$"},
|
||||
"population": {"type": "integer"},
|
||||
},
|
||||
"required": ["name", "population"],
|
||||
})
|
||||
|
||||
# JSON (works with both Outlines and XGrammar)
|
||||
response = requests.post(
|
||||
"http://localhost:30000/generate",
|
||||
json={
|
||||
"text": "Here is the information of the capital of France in the JSON format.\n",
|
||||
"sampling_params": {
|
||||
"temperature": 0,
|
||||
"max_new_tokens": 64,
|
||||
"json_schema": json_schema,
|
||||
},
|
||||
},
|
||||
)
|
||||
print(response.json())
|
||||
|
||||
# Regular expression (Outlines backend only)
|
||||
response = requests.post(
|
||||
"http://localhost:30000/generate",
|
||||
json={
|
||||
"text": "Paris is the capital of",
|
||||
"sampling_params": {
|
||||
"temperature": 0,
|
||||
"max_new_tokens": 64,
|
||||
"regex": "(France|England)",
|
||||
},
|
||||
},
|
||||
)
|
||||
print(response.json())
|
||||
|
||||
# EBNF (XGrammar backend only)
|
||||
response = requests.post(
|
||||
"http://localhost:30000/generate",
|
||||
json={
|
||||
"text": "Write a greeting.",
|
||||
"sampling_params": {
|
||||
"temperature": 0,
|
||||
"max_new_tokens": 64,
|
||||
"ebnf": 'root ::= "Hello" | "Hi" | "Hey"',
|
||||
},
|
||||
},
|
||||
)
|
||||
print(response.json())
|
||||
```
|
||||
|
||||
Detailed example in [structured outputs](../advanced_features/structured_outputs.ipynb).
|
||||
|
||||
### Custom logit processor
|
||||
|
||||
Launch a server with `--enable-custom-logit-processor` flag on.
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --enable-custom-logit-processor
|
||||
```
|
||||
|
||||
Define a custom logit processor that will always sample a specific token id.
|
||||
|
||||
```python
|
||||
from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
|
||||
|
||||
class DeterministicLogitProcessor(CustomLogitProcessor):
|
||||
"""A dummy logit processor that changes the logits to always
|
||||
sample the given token id.
|
||||
"""
|
||||
|
||||
def __call__(self, logits, custom_param_list):
|
||||
# Check that the number of logits matches the number of custom parameters
|
||||
assert logits.shape[0] == len(custom_param_list)
|
||||
key = "token_id"
|
||||
|
||||
for i, param_dict in enumerate(custom_param_list):
|
||||
# Mask all other tokens
|
||||
logits[i, :] = -float("inf")
|
||||
# Assign highest probability to the specified token
|
||||
logits[i, param_dict[key]] = 0.0
|
||||
return logits
|
||||
```
|
||||
|
||||
Send a request:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:30000/generate",
|
||||
json={
|
||||
"text": "The capital of France is",
|
||||
"custom_logit_processor": DeterministicLogitProcessor().to_str(),
|
||||
"sampling_params": {
|
||||
"temperature": 0.0,
|
||||
"max_new_tokens": 32,
|
||||
"custom_params": {"token_id": 5},
|
||||
},
|
||||
},
|
||||
)
|
||||
print(response.json())
|
||||
```
|
||||
253
docs/basic_usage/send_request.ipynb
Normal file
253
docs/basic_usage/send_request.ipynb
Normal file
@@ -0,0 +1,253 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Sending Requests\n",
|
||||
"This notebook provides a quick-start guide to use SGLang in chat completions after installation.\n",
|
||||
"\n",
|
||||
"- For Vision Language Models, see [OpenAI APIs - Vision](openai_api_vision.ipynb).\n",
|
||||
"- For Embedding Models, see [OpenAI APIs - Embedding](openai_api_embeddings.ipynb) and [Encode (embedding model)](native_api.html#Encode-(embedding-model)).\n",
|
||||
"- For Reward Models, see [Classify (reward model)](native_api.html#Classify-(reward-model))."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Launch A Server"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
|
||||
"\n",
|
||||
"# This is equivalent to running the following command in your terminal\n",
|
||||
"# python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\n",
|
||||
"\n",
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"\"\"\n",
|
||||
"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n",
|
||||
" --host 0.0.0.0 --log-level warning\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using cURL\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import subprocess, json\n",
|
||||
"\n",
|
||||
"curl_command = f\"\"\"\n",
|
||||
"curl -s http://localhost:{port}/v1/chat/completions \\\n",
|
||||
" -H \"Content-Type: application/json\" \\\n",
|
||||
" -d '{{\"model\": \"qwen/qwen2.5-0.5b-instruct\", \"messages\": [{{\"role\": \"user\", \"content\": \"What is the capital of France?\"}}]}}'\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"response = json.loads(subprocess.check_output(curl_command, shell=True))\n",
|
||||
"print_highlight(response)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Python Requests"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"url = f\"http://localhost:{port}/v1/chat/completions\"\n",
|
||||
"\n",
|
||||
"data = {\n",
|
||||
" \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n",
|
||||
" \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"response = requests.post(url, json=data)\n",
|
||||
"print_highlight(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using OpenAI Python Client"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import openai\n",
|
||||
"\n",
|
||||
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=64,\n",
|
||||
")\n",
|
||||
"print_highlight(response)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Streaming"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import openai\n",
|
||||
"\n",
|
||||
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"# Use stream=True for streaming responses\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=64,\n",
|
||||
" stream=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Handle the streaming output\n",
|
||||
"for chunk in response:\n",
|
||||
" if chunk.choices[0].delta.content:\n",
|
||||
" print(chunk.choices[0].delta.content, end=\"\", flush=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Native Generation APIs\n",
|
||||
"\n",
|
||||
"You can also use the native `/generate` endpoint with requests, which provides more flexibility. An API reference is available at [Sampling Parameters](sampling_params.md)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"response = requests.post(\n",
|
||||
" f\"http://localhost:{port}/generate\",\n",
|
||||
" json={\n",
|
||||
" \"text\": \"The capital of France is\",\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"max_new_tokens\": 32,\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print_highlight(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Streaming"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests, json\n",
|
||||
"\n",
|
||||
"response = requests.post(\n",
|
||||
" f\"http://localhost:{port}/generate\",\n",
|
||||
" json={\n",
|
||||
" \"text\": \"The capital of France is\",\n",
|
||||
" \"sampling_params\": {\n",
|
||||
" \"temperature\": 0,\n",
|
||||
" \"max_new_tokens\": 32,\n",
|
||||
" },\n",
|
||||
" \"stream\": True,\n",
|
||||
" },\n",
|
||||
" stream=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"prev = 0\n",
|
||||
"for chunk in response.iter_lines(decode_unicode=False):\n",
|
||||
" chunk = chunk.decode(\"utf-8\")\n",
|
||||
" if chunk and chunk.startswith(\"data:\"):\n",
|
||||
" if chunk == \"data: [DONE]\":\n",
|
||||
" break\n",
|
||||
" data = json.loads(chunk[5:].strip(\"\\n\"))\n",
|
||||
" output = data[\"text\"]\n",
|
||||
" print(output[prev:], end=\"\", flush=True)\n",
|
||||
" prev = len(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
205
docs/conf.py
Normal file
205
docs/conf.py
Normal file
@@ -0,0 +1,205 @@
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
sys.path.insert(0, os.path.abspath("../.."))
|
||||
|
||||
version_file = "../python/sglang/version.py"
|
||||
with open(version_file, "r") as f:
|
||||
exec(compile(f.read(), version_file, "exec"))
|
||||
__version__ = locals()["__version__"]
|
||||
|
||||
project = "SGLang"
|
||||
copyright = f"2023-{datetime.now().year}, SGLang"
|
||||
author = "SGLang Team"
|
||||
|
||||
version = __version__
|
||||
release = __version__
|
||||
|
||||
extensions = [
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.autosummary",
|
||||
"sphinx.ext.napoleon",
|
||||
"sphinx.ext.viewcode",
|
||||
"sphinx.ext.autosectionlabel",
|
||||
"sphinx.ext.intersphinx",
|
||||
"sphinx_tabs.tabs",
|
||||
"myst_parser",
|
||||
"sphinx_copybutton",
|
||||
"sphinxcontrib.mermaid",
|
||||
"nbsphinx",
|
||||
"sphinx.ext.mathjax",
|
||||
]
|
||||
|
||||
nbsphinx_allow_errors = True
|
||||
nbsphinx_execute = "never"
|
||||
|
||||
autosectionlabel_prefix_document = True
|
||||
nbsphinx_allow_directives = True
|
||||
|
||||
|
||||
myst_enable_extensions = [
|
||||
"dollarmath",
|
||||
"amsmath",
|
||||
"deflist",
|
||||
"colon_fence",
|
||||
"html_image",
|
||||
"linkify",
|
||||
"substitution",
|
||||
]
|
||||
|
||||
myst_heading_anchors = 3
|
||||
|
||||
nbsphinx_kernel_name = "python3"
|
||||
nbsphinx_execute_arguments = [
|
||||
"--InlineBackend.figure_formats={'svg', 'pdf'}",
|
||||
"--InlineBackend.rc={'figure.dpi': 96}",
|
||||
]
|
||||
|
||||
|
||||
nb_render_priority = {
|
||||
"html": (
|
||||
"application/vnd.jupyter.widget-view+json",
|
||||
"application/javascript",
|
||||
"text/html",
|
||||
"image/svg+xml",
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"text/markdown",
|
||||
"text/latex",
|
||||
"text/plain",
|
||||
)
|
||||
}
|
||||
|
||||
myst_enable_extensions = [
|
||||
"dollarmath",
|
||||
"amsmath",
|
||||
"deflist",
|
||||
"colon_fence",
|
||||
"html_image",
|
||||
"linkify",
|
||||
"substitution",
|
||||
]
|
||||
|
||||
myst_heading_anchors = 3
|
||||
myst_ref_domains = ["std", "py"]
|
||||
|
||||
templates_path = ["_templates"]
|
||||
|
||||
source_suffix = {
|
||||
".rst": "restructuredtext",
|
||||
".md": "markdown",
|
||||
}
|
||||
|
||||
master_doc = "index"
|
||||
|
||||
language = "en"
|
||||
|
||||
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
||||
|
||||
pygments_style = "sphinx"
|
||||
|
||||
html_theme = "sphinx_book_theme"
|
||||
html_logo = "_static/image/logo.png"
|
||||
html_favicon = "_static/image/logo.ico"
|
||||
html_title = project
|
||||
html_copy_source = True
|
||||
html_last_updated_fmt = ""
|
||||
|
||||
html_theme_options = {
|
||||
"repository_url": "https://github.com/sgl-project/sgl-project.github.io",
|
||||
"repository_branch": "main",
|
||||
"show_navbar_depth": 3,
|
||||
"max_navbar_depth": 4,
|
||||
"collapse_navbar": True,
|
||||
"use_edit_page_button": True,
|
||||
"use_source_button": True,
|
||||
"use_issues_button": True,
|
||||
"use_repository_button": True,
|
||||
"use_download_button": True,
|
||||
"use_sidenotes": True,
|
||||
"show_toc_level": 2,
|
||||
}
|
||||
|
||||
html_context = {
|
||||
"display_github": True,
|
||||
"github_user": "sgl-project",
|
||||
"github_repo": "sgl-project.github.io",
|
||||
"github_version": "main",
|
||||
"conf_py_path": "/docs/",
|
||||
}
|
||||
|
||||
html_static_path = ["_static"]
|
||||
html_css_files = ["css/custom_log.css"]
|
||||
|
||||
|
||||
def setup(app):
|
||||
app.add_css_file("css/custom_log.css")
|
||||
|
||||
|
||||
myst_enable_extensions = [
|
||||
"dollarmath",
|
||||
"amsmath",
|
||||
"deflist",
|
||||
"colon_fence",
|
||||
]
|
||||
myst_heading_anchors = 5
|
||||
|
||||
htmlhelp_basename = "sglangdoc"
|
||||
|
||||
latex_elements = {}
|
||||
|
||||
latex_documents = [
|
||||
(master_doc, "sglang.tex", "sglang Documentation", "SGLang Team", "manual"),
|
||||
]
|
||||
|
||||
man_pages = [(master_doc, "sglang", "sglang Documentation", [author], 1)]
|
||||
|
||||
texinfo_documents = [
|
||||
(
|
||||
master_doc,
|
||||
"sglang",
|
||||
"sglang Documentation",
|
||||
author,
|
||||
"sglang",
|
||||
"One line description of project.",
|
||||
"Miscellaneous",
|
||||
),
|
||||
]
|
||||
|
||||
epub_title = project
|
||||
|
||||
epub_exclude_files = ["search.html"]
|
||||
|
||||
copybutton_prompt_text = r">>> |\.\.\. "
|
||||
copybutton_prompt_is_regexp = True
|
||||
|
||||
autodoc_preserve_defaults = True
|
||||
navigation_with_keys = False
|
||||
|
||||
autodoc_mock_imports = [
|
||||
"torch",
|
||||
"transformers",
|
||||
"triton",
|
||||
]
|
||||
|
||||
intersphinx_mapping = {
|
||||
"python": ("https://docs.python.org/3.12", None),
|
||||
"typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None),
|
||||
"pillow": ("https://pillow.readthedocs.io/en/stable", None),
|
||||
"numpy": ("https://numpy.org/doc/stable", None),
|
||||
"torch": ("https://pytorch.org/docs/stable", None),
|
||||
}
|
||||
|
||||
html_theme = "sphinx_book_theme"
|
||||
|
||||
|
||||
nbsphinx_prolog = """
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
.output_area.stderr, .output_area.stdout {
|
||||
color: #d3d3d3 !important; /* light gray */
|
||||
}
|
||||
</style>
|
||||
"""
|
||||
22
docs/deploy.py
Normal file
22
docs/deploy.py
Normal file
@@ -0,0 +1,22 @@
|
||||
# Deploy the documents
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def run_cmd(cmd):
|
||||
print(cmd)
|
||||
os.system(cmd)
|
||||
|
||||
|
||||
run_cmd("cd $DOC_SITE_PATH; git pull")
|
||||
|
||||
# (Optional) Remove old files
|
||||
# run_cmd("rm -rf $ALPA_SITE_PATH/*")
|
||||
|
||||
run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
|
||||
|
||||
cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
|
||||
run_cmd(
|
||||
f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main"
|
||||
)
|
||||
334
docs/developer_guide/bench_serving.md
Normal file
334
docs/developer_guide/bench_serving.md
Normal file
@@ -0,0 +1,334 @@
|
||||
## Bench Serving Guide
|
||||
|
||||
This guide explains how to benchmark online serving throughput and latency using `python -m sglang.bench_serving`. It supports multiple inference backends via OpenAI-compatible and native endpoints, and produces both console metrics and optional JSONL outputs.
|
||||
|
||||
### What it does
|
||||
|
||||
- Generates synthetic or dataset-driven prompts and submits them to a target serving endpoint
|
||||
- Measures throughput, time-to-first-token (TTFT), inter-token latency (ITL), per-request end-to-end latency, and more
|
||||
- Supports streaming or non-streaming modes, rate control, and concurrency limits
|
||||
|
||||
### Supported backends and endpoints
|
||||
|
||||
- `sglang` / `sglang-native`: `POST /generate`
|
||||
- `sglang-oai`, `vllm`, `lmdeploy`: `POST /v1/completions`
|
||||
- `sglang-oai-chat`, `vllm-chat`, `lmdeploy-chat`: `POST /v1/chat/completions`
|
||||
- `trt` (TensorRT-LLM): `POST /v2/models/ensemble/generate_stream`
|
||||
- `gserver`: Custom server (Not Implemented yet in this script)
|
||||
- `truss`: `POST /v1/models/model:predict`
|
||||
|
||||
If `--base-url` is provided, requests are sent to it. Otherwise, `--host` and `--port` are used. When `--model` is not provided, the script will attempt to query `GET /v1/models` for an available model ID (OpenAI-compatible endpoints).
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Python 3.8+
|
||||
- Dependencies typically used by this script: `aiohttp`, `numpy`, `requests`, `tqdm`, `transformers`, and for some datasets `datasets`, `pillow`, `pybase64`. Install as needed.
|
||||
- An inference server running and reachable via the endpoints above
|
||||
- If your server requires authentication, set environment variable `OPENAI_API_KEY` (used as `Authorization: Bearer <key>`)
|
||||
|
||||
### Quick start
|
||||
|
||||
Run a basic benchmark against an sglang server exposing `/generate`:
|
||||
|
||||
```bash
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct
|
||||
```
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend sglang \
|
||||
--host 127.0.0.1 --port 30000 \
|
||||
--num-prompts 1000 \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct
|
||||
```
|
||||
|
||||
Or, using an OpenAI-compatible endpoint (completions):
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend vllm \
|
||||
--base-url http://127.0.0.1:8000 \
|
||||
--num-prompts 1000 \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct
|
||||
```
|
||||
|
||||
### Datasets
|
||||
|
||||
Select with `--dataset-name`:
|
||||
|
||||
- `sharegpt` (default): loads ShareGPT-style pairs; optionally restrict with `--sharegpt-context-len` and override outputs with `--sharegpt-output-len`
|
||||
- `random`: random text lengths; sampled from ShareGPT token space
|
||||
- `random-ids`: random token ids (can lead to gibberish)
|
||||
- `random-image`: generates random images and wraps them in chat messages; supports custom resolutions via 'heightxwidth' format
|
||||
- `generated-shared-prefix`: synthetic dataset with shared long system prompts and short questions
|
||||
- `mmmu`: samples from MMMU (Math split) and includes images
|
||||
|
||||
Common dataset flags:
|
||||
|
||||
- `--num-prompts N`: number of requests
|
||||
- `--random-input-len`, `--random-output-len`, `--random-range-ratio`: for random/random-ids/random-image
|
||||
- `--random-image-num-images`, `--random-image-resolution`: for random-image dataset (supports presets 1080p/720p/360p or custom 'heightxwidth' format)
|
||||
- `--apply-chat-template`: apply tokenizer chat template when constructing prompts
|
||||
- `--dataset-path PATH`: file path for ShareGPT json; if blank and missing, it will be downloaded and cached
|
||||
|
||||
Generated Shared Prefix flags (for `generated-shared-prefix`):
|
||||
|
||||
- `--gsp-num-groups`
|
||||
- `--gsp-prompts-per-group`
|
||||
- `--gsp-system-prompt-len`
|
||||
- `--gsp-question-len`
|
||||
- `--gsp-output-len`
|
||||
|
||||
Random Image dataset flags (for `random-image`):
|
||||
|
||||
- `--random-image-num-images`: Number of images per request
|
||||
- `--random-image-resolution`: Image resolution; supports presets (1080p, 720p, 360p) or custom 'heightxwidth' format (e.g., 1080x1920, 512x768)
|
||||
|
||||
### Examples
|
||||
|
||||
1. To benchmark random-image dataset with 3 images per request, 500 prompts, 512 input length, and 512 output length, you can run:
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-3B-Instruct --disable-radix-cache
|
||||
```
|
||||
|
||||
```bash
|
||||
python -m sglang.bench_serving \
|
||||
--backend sglang-oai-chat \
|
||||
--dataset-name random-image \
|
||||
--num-prompts 500 \
|
||||
--random-image-num-images 3 \
|
||||
--random-image-resolution 720p \
|
||||
--random-input-len 512 \
|
||||
--random-output-len 512
|
||||
```
|
||||
|
||||
2. To benchmark random dataset with 3000 prompts, 1024 input length, and 1024 output length, you can run:
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path Qwen/Qwen2.5-3B-Instruct
|
||||
```
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend sglang \
|
||||
--dataset-name random \
|
||||
--num-prompts 3000 \
|
||||
  --random-input-len 1024 \
|
||||
  --random-output-len 1024 \
|
||||
--random-range-ratio 0.5
|
||||
```
|
||||
|
||||
### Choosing model and tokenizer
|
||||
|
||||
- `--model` is required unless the backend exposes `GET /v1/models`, in which case the first model ID is auto-selected.
|
||||
- `--tokenizer` defaults to `--model`. Both can be HF model IDs or local paths.
|
||||
- For ModelScope workflows, setting `SGLANG_USE_MODELSCOPE=true` enables fetching via ModelScope (weights are skipped for speed).
|
||||
- If your tokenizer lacks a chat template, the script warns because token counting can be less robust for gibberish outputs.
|
||||
|
||||
### Rate, concurrency, and streaming
|
||||
|
||||
- `--request-rate`: requests per second. `inf` sends all immediately (burst). Non-infinite rate uses a Poisson process for arrival times.
|
||||
- `--max-concurrency`: caps concurrent in-flight requests regardless of arrival rate.
|
||||
- `--disable-stream`: switch to non-streaming mode when supported; TTFT then equals total latency for chat completions.
|
||||
|
||||
### Other key options
|
||||
|
||||
- `--output-file FILE.jsonl`: append JSONL results to file; auto-named if unspecified
|
||||
- `--output-details`: include per-request arrays (generated texts, errors, ttfts, itls, input/output lens)
|
||||
- `--extra-request-body '{"top_p":0.9,"temperature":0.6}'`: merged into payload (sampling params, etc.)
|
||||
- `--disable-ignore-eos`: pass through EOS behavior (varies by backend)
|
||||
- `--warmup-requests N`: run warmup requests with short output first (default 1)
|
||||
- `--flush-cache`: call `/flush_cache` (sglang) before main run
|
||||
- `--profile`: call `/start_profile` and `/stop_profile` (requires server to enable profiling, e.g., `SGLANG_TORCH_PROFILER_DIR`)
|
||||
- `--lora-name name1 name2 ...`: randomly pick one per request and pass to backend (e.g., `lora_path` for sglang)
|
||||
- `--tokenize-prompt`: send integer IDs instead of text (currently supports `--backend sglang` only)
|
||||
|
||||
### Authentication
|
||||
|
||||
If your target endpoint requires OpenAI-style auth, set:
|
||||
|
||||
```bash
|
||||
export OPENAI_API_KEY=sk-...yourkey...
|
||||
```
|
||||
|
||||
The script will add `Authorization: Bearer $OPENAI_API_KEY` automatically for OpenAI-compatible routes.
|
||||
|
||||
### Metrics explained
|
||||
|
||||
Printed after each run:
|
||||
|
||||
- Request throughput (req/s)
|
||||
- Input token throughput (tok/s)
|
||||
- Output token throughput (tok/s)
|
||||
- Total token throughput (tok/s)
|
||||
- Concurrency: aggregate time of all requests divided by wall time
|
||||
- End-to-End Latency (ms): mean/median/std/p99 per-request total latency
|
||||
- Time to First Token (TTFT, ms): mean/median/std/p99 for streaming mode
|
||||
- Inter-Token Latency (ITL, ms): mean/median/std/p95/p99/max between tokens
|
||||
- TPOT (ms): Token processing time after first token, i.e., `(latency - ttft)/(tokens-1)`
|
||||
- Accept length (sglang-only, if available): speculative decoding accept length
|
||||
|
||||
The script also retokenizes generated text with the configured tokenizer and reports "retokenized" counts.
|
||||
|
||||
### JSONL output format
|
||||
|
||||
When `--output-file` is set, one JSON object is appended per run. Base fields:
|
||||
|
||||
- Arguments summary: backend, dataset, request_rate, max_concurrency, etc.
|
||||
- Duration and totals: completed, total_input_tokens, total_output_tokens, retokenized totals
|
||||
- Throughputs and latency statistics as printed in the console
|
||||
- `accept_length` when available (sglang)
|
||||
|
||||
With `--output-details`, an extended object also includes arrays:
|
||||
|
||||
- `input_lens`, `output_lens`
|
||||
- `ttfts`, `itls` (per request: ITL arrays)
|
||||
- `generated_texts`, `errors`
|
||||
|
||||
### End-to-end examples
|
||||
|
||||
1) sglang native `/generate` (streaming):
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend sglang \
|
||||
--host 127.0.0.1 --port 30000 \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--dataset-name random \
|
||||
--random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.5 \
|
||||
--num-prompts 2000 \
|
||||
--request-rate 100 \
|
||||
--max-concurrency 512 \
|
||||
--output-file sglang_random.jsonl --output-details
|
||||
```
|
||||
|
||||
2) OpenAI-compatible Completions (e.g., vLLM):
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend vllm \
|
||||
--base-url http://127.0.0.1:8000 \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--dataset-name sharegpt \
|
||||
--num-prompts 1000 \
|
||||
--sharegpt-output-len 256
|
||||
```
|
||||
|
||||
3) OpenAI-compatible Chat Completions (streaming):
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend vllm-chat \
|
||||
--base-url http://127.0.0.1:8000 \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--dataset-name random \
|
||||
--num-prompts 500 \
|
||||
--apply-chat-template
|
||||
```
|
||||
|
||||
4) Random images (VLM) with chat template:
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend sglang \
|
||||
--host 127.0.0.1 --port 30000 \
|
||||
--model your-vlm-model \
|
||||
--dataset-name random-image \
|
||||
--random-image-num-images 2 \
|
||||
--random-image-resolution 720p \
|
||||
--random-input-len 128 --random-output-len 256 \
|
||||
--num-prompts 200 \
|
||||
--apply-chat-template
|
||||
```
|
||||
|
||||
4a) Random images with custom resolution:
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend sglang \
|
||||
--host 127.0.0.1 --port 30000 \
|
||||
--model your-vlm-model \
|
||||
--dataset-name random-image \
|
||||
--random-image-num-images 1 \
|
||||
--random-image-resolution 512x768 \
|
||||
--random-input-len 64 --random-output-len 128 \
|
||||
--num-prompts 100 \
|
||||
--apply-chat-template
|
||||
```
|
||||
|
||||
5) Generated shared prefix (long system prompts + short questions):
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend sglang \
|
||||
--host 127.0.0.1 --port 30000 \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--dataset-name generated-shared-prefix \
|
||||
--gsp-num-groups 64 --gsp-prompts-per-group 16 \
|
||||
--gsp-system-prompt-len 2048 --gsp-question-len 128 --gsp-output-len 256 \
|
||||
--num-prompts 1024
|
||||
```
|
||||
|
||||
6) Tokenized prompts (ids) for strict length control (sglang only):
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend sglang \
|
||||
--host 127.0.0.1 --port 30000 \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--dataset-name random \
|
||||
--tokenize-prompt \
|
||||
--random-input-len 2048 --random-output-len 256 --random-range-ratio 0.2
|
||||
```
|
||||
|
||||
7) Profiling and cache flush (sglang):
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend sglang \
|
||||
--host 127.0.0.1 --port 30000 \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--profile \
|
||||
--flush-cache
|
||||
```
|
||||
|
||||
8) TensorRT-LLM streaming endpoint:
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend trt \
|
||||
--base-url http://127.0.0.1:8000 \
|
||||
--model your-trt-llm-model \
|
||||
--dataset-name random \
|
||||
--num-prompts 100 \
|
||||
--disable-ignore-eos
|
||||
```
|
||||
|
||||
9) Evaluating large-scale KVCache sharing with mooncake trace (sglang only):
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend sglang \
|
||||
--host 127.0.0.1 --port 30000 \
|
||||
  --model model-name \
|
||||
--dataset-name mooncake \
|
||||
--mooncake-slowdown-factor 1.0 \
|
||||
--mooncake-num-rounds 1000 \
|
||||
  --mooncake-workload conversation|mooncake|agent|synthetic \
|
||||
--use-trace-timestamps true \
|
||||
--random-output-len 256
|
||||
```
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
- All requests failed: verify `--backend`, server URL/port, `--model`, and authentication. Check warmup errors printed by the script.
|
||||
- Throughput seems too low: adjust `--request-rate` and `--max-concurrency`; verify server batch size/scheduling; ensure streaming is enabled if appropriate.
|
||||
- Token counts look odd: prefer chat/instruct models with proper chat templates; otherwise tokenization of gibberish may be inconsistent.
|
||||
- Random-image/MMMU datasets: ensure you installed extra deps (`pillow`, `datasets`, `pybase64`).
|
||||
- Authentication errors (401/403): set `OPENAI_API_KEY` or disable auth on your server.
|
||||
|
||||
### Notes
|
||||
|
||||
- The script raises the file descriptor soft limit (`RLIMIT_NOFILE`) to help with many concurrent connections.
|
||||
- For sglang, `/get_server_info` is queried post-run to report speculative decoding accept length when available.
|
||||
182
docs/developer_guide/benchmark_and_profiling.md
Normal file
182
docs/developer_guide/benchmark_and_profiling.md
Normal file
@@ -0,0 +1,182 @@
|
||||
# Benchmark and Profiling
|
||||
|
||||
## Benchmark
|
||||
|
||||
- Benchmark the latency of running a single static batch without a server. The arguments are the same as for `launch_server.py`.
|
||||
Note that this is a simplified test script without a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this simplified script does not.
|
||||
- Without a server (do not need to launch a server)
|
||||
```bash
|
||||
python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch 32 --input-len 256 --output-len 32
|
||||
```
|
||||
- With a server (please use `sglang.launch_server` to launch a server first and run the following command.)
|
||||
```bash
|
||||
python -m sglang.bench_one_batch_server --base-url http://127.0.0.1:30000 --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 32 --input-len 256 --output-len 32
|
||||
```
|
||||
|
||||
|
||||
- Benchmark offline processing. This script will start an offline engine and run the benchmark.
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10
|
||||
```
|
||||
|
||||
- Benchmark online serving. Please use `sglang.launch_server` to launch a server first and run the following command.
|
||||
|
||||
```bash
|
||||
python3 -m sglang.bench_serving --backend sglang --num-prompts 10
|
||||
```
|
||||
|
||||
## Profile with PyTorch Profiler
|
||||
|
||||
[Pytorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) is a convenient basic tool to inspect kernel execution time, call stack, and kernel overlap and occupancy.
|
||||
|
||||
### Profile a server with `sglang.bench_serving`
|
||||
|
||||
```bash
|
||||
# set trace path
|
||||
export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
|
||||
|
||||
# start server
|
||||
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct
|
||||
|
||||
# send profiling request from client
|
||||
python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --sharegpt-output-len 100 --profile
|
||||
```
|
||||
|
||||
Please make sure that `SGLANG_TORCH_PROFILER_DIR` is set on both the server and client side; otherwise, the trace files cannot be generated correctly. A reliable way is to set `SGLANG_TORCH_PROFILER_DIR` in your shell's rc file (e.g. `~/.bashrc` for bash shells).
|
||||
|
||||
For more details, please refer to [Bench Serving Guide](./bench_serving.md).
|
||||
|
||||
### Profile a server with `sglang.bench_offline_throughput`
|
||||
```bash
|
||||
export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
|
||||
|
||||
# profile one batch with bench_one_batch.py
|
||||
# batch size can be controlled with --batch argument
|
||||
python3 -m sglang.bench_one_batch --model-path meta-llama/Llama-3.1-8B-Instruct --batch 32 --input-len 1024 --output-len 10 --profile
|
||||
|
||||
# profile multiple batches with bench_offline_throughput.py
|
||||
python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
|
||||
```
|
||||
|
||||
### Profile a server with `sglang.profiler`
|
||||
|
||||
When the server is running (e.g., processing a decoding request), you can start live profiling immediately by sending a profile request to the server.
|
||||
|
||||
You can do this by running `python3 -m sglang.profiler`. For example:
|
||||
|
||||
```
|
||||
# Terminal 1: Send a generation request
|
||||
python3 -m sglang.test.send_one
|
||||
|
||||
# Terminal 2: Before the above request finishes, quickly launch the following command in a separate terminal.
|
||||
# It will generate a profile of the above request for several decoding batches.
|
||||
python3 -m sglang.profiler
|
||||
```
|
||||
|
||||
### Possible PyTorch bugs
|
||||
If you encounter the following error (for example, when using Qwen 2.5 VL):
|
||||
```bash
|
||||
RuntimeError: !stack.empty() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/autograd/profiler_python.cpp":983, please report a bug to PyTorch. Python replay stack is empty.
|
||||
```
|
||||
This is likely a PyTorch Bug reported in [Bug: vLLM Profiler](https://github.com/vllm-project/vllm/issues/18240) and [Bug: torch.profiler.profile](https://github.com/pytorch/pytorch/issues/101632). As a workaround, you may disable `with_stack` with an environment variable such as follows:
|
||||
```bash
|
||||
export SGLANG_PROFILE_WITH_STACK=False
|
||||
python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
|
||||
```
|
||||
|
||||
### View traces
|
||||
|
||||
Trace files can be loaded and visualized from:
|
||||
|
||||
1. https://ui.perfetto.dev/ (any browser)
|
||||
2. chrome://tracing (Chrome browser only)
|
||||
|
||||
If browser cannot open trace file due to its large size,
|
||||
client can generate a small trace file (<100MB) by controlling number of prompts and lengths of prompt outputs.
|
||||
For example, when profiling a server,
|
||||
|
||||
```bash
|
||||
python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 2 --sharegpt-output-len 100 --profile
|
||||
```
|
||||
|
||||
This command sets the number of prompts to 2 with `--num-prompts` argument and limits the length of output sequences to 100 with `--sharegpt-output-len` argument, which can generate a small trace file for browser to open smoothly.
|
||||
|
||||
Additionally, if you want to locate the SGLang Python source code through the cuda kernel in Trace, you need to disable CUDA Graph when starting the service. This can be done by using the `--disable-cuda-graph` parameter in the command to start the service.
|
||||
|
||||
## Profile with Nsight
|
||||
|
||||
[Nsight systems](https://docs.nvidia.com/nsight-systems/) is an advanced tool that exposes more profiling details, such as register and shared memory usage, annotated code regions and low-level CUDA APIs and events.
|
||||
|
||||
1. Prerequisite:
|
||||
|
||||
Install using apt, or run inside a [NVIDIA Docker container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) or [SGLang Docker container](https://github.com/sgl-project/sglang/tree/main/docker).
|
||||
|
||||
```bash
|
||||
# install nsys
|
||||
# https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html
|
||||
apt update
|
||||
apt install -y --no-install-recommends gnupg
|
||||
echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list
|
||||
apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
|
||||
apt update
|
||||
apt install nsight-systems-cli
|
||||
```
|
||||
|
||||
2. To profile a single batch, use
|
||||
|
||||
```bash
|
||||
nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node python3 -m sglang.bench_one_batch --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512
|
||||
```
|
||||
|
||||
3. To profile a server, e.g.
|
||||
|
||||
```bash
|
||||
# launch the server, set the delay and duration times according to needs
|
||||
# after the duration time has been used up, server will be killed by nsys
|
||||
|
||||
nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
|
||||
|
||||
# client
|
||||
python3 -m sglang.bench_serving --backend sglang --num-prompts 1000 --dataset-name random --random-input-len 1024 --random-output-len 512
|
||||
```
|
||||
|
||||
In practice, we recommend setting the `--duration` argument to a large value. Whenever you want the server to stop profiling, first run:
|
||||
|
||||
```bash
|
||||
nsys sessions list
|
||||
```
|
||||
|
||||
to get the session id in the form of `profile-XXXXX`, then run:
|
||||
|
||||
```bash
|
||||
nsys stop --session=profile-XXXXX
|
||||
```
|
||||
|
||||
to manually kill the profiler and generate `nsys-rep` files instantly.
|
||||
|
||||
4. Use NVTX to annotate code regions, e.g. to see their execution time.
|
||||
|
||||
```bash
|
||||
# install nvtx
|
||||
pip install nvtx
|
||||
```
|
||||
|
||||
```python
|
||||
# code snippets
|
||||
import nvtx
|
||||
with nvtx.annotate("description", color="color"):
|
||||
# some critical code
|
||||
```
|
||||
|
||||
## Other tips
|
||||
|
||||
1. You can benchmark a model using dummy weights by only providing the config.json file. This allows for quick testing of model variants without training. To do so, add `--load-format dummy` to the above commands and then you only need a correct `config.json` under the checkpoint folder.
|
||||
2. You can benchmark a model with modified configs (e.g., less layers) by using `--json-model-override-args`. For example, you can benchmark a model with only 2 layers and 2 kv heads using:
|
||||
|
||||
```bash
|
||||
python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch 32 --input-len 256 --output-len 32 --load-format dummy --json-model-override-args '{"num_hidden_layers": 1, "num_key_value_heads": 1}'
|
||||
```
|
||||
|
||||
3. You can use `--python-backtrace=cuda` to see python call stack for all CUDA kernels, as in PyTorch Profiler. (Caveat: this can cause inaccurately long kernel runtimes for CUDA event based timing)
|
||||
4. For more arguments see [Nsight Systems User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html).
|
||||
103
docs/developer_guide/contribution_guide.md
Normal file
103
docs/developer_guide/contribution_guide.md
Normal file
@@ -0,0 +1,103 @@
|
||||
# Contribution Guide
|
||||
|
||||
Welcome to **SGLang**! We appreciate your interest in contributing. This guide provides a concise overview of how to set up your environment, run tests, build documentation, and open a Pull Request (PR). Whether you’re fixing a small bug or developing a major feature, we encourage following these steps for a smooth contribution process.
|
||||
|
||||
## Install SGLang from Source
|
||||
|
||||
### Fork and clone the repository
|
||||
|
||||
**Note**: New contributors do **not** have the write permission to push to the official SGLang repo. Please fork the repository under your GitHub account, then clone your fork locally.
|
||||
|
||||
```bash
|
||||
git clone https://github.com/<your_user_name>/sglang.git
|
||||
```
|
||||
|
||||
### Build from source
|
||||
|
||||
Refer to [Install SGLang from Source](../get_started/install.md#method-2-from-source).
|
||||
|
||||
## Format code with pre-commit
|
||||
|
||||
We use [pre-commit](https://pre-commit.com/) to maintain consistent code style checks. Before pushing your changes, please run:
|
||||
|
||||
```bash
|
||||
pip3 install pre-commit
|
||||
pre-commit install
|
||||
pre-commit run --all-files
|
||||
```
|
||||
|
||||
- **`pre-commit run --all-files`** manually runs all configured checks, applying fixes if possible. If it fails the first time, re-run it to ensure lint errors are fully resolved. Make sure your code passes all checks **before** creating a Pull Request.
|
||||
- **Do not commit** directly to the `main` branch. Always create a new branch (e.g., `feature/my-new-feature`), push your changes, and open a PR from that branch.
|
||||
|
||||
## Run and add unit tests
|
||||
|
||||
If you add a new feature or fix a bug, please add corresponding unit tests to ensure coverage and prevent regression.
|
||||
SGLang uses Python's built-in [unittest](https://docs.python.org/3/library/unittest.html) framework.
|
||||
For detailed instructions on running tests and integrating them into CI, refer to [test/README.md](https://github.com/sgl-project/sglang/tree/main/test/README.md).
|
||||
|
||||
## Write documentations
|
||||
|
||||
We recommend new contributors start from writing documentation, which helps you quickly understand SGLang codebase.
|
||||
For more details, please refer to [docs/README.md](https://github.com/sgl-project/sglang/tree/main/docs/README.md).
|
||||
|
||||
## Test the accuracy
|
||||
If your code changes the model output, please run the accuracy tests. A quick sanity check is the few-shot GSM8K.
|
||||
|
||||
```
|
||||
# Launch a server
|
||||
python3 -m sglang.launch_server --model Qwen/Qwen2-7B-Instruct
|
||||
|
||||
# Evaluate
|
||||
python3 -m sglang.test.few_shot_gsm8k --num-questions 200
|
||||
```
|
||||
|
||||
Please note that the above script is primarily a sanity check, not a rigorous accuracy or speed test.
|
||||
This test can have significant variance (1%–5%) in accuracy due to batching and the non-deterministic nature of the inference engine.
|
||||
Also, do not rely on the "Latency/Output throughput" from this script, as it is not a proper speed test.
|
||||
|
||||
GSM8K is too easy for state-of-the-art models nowadays. Please try your own more challenging accuracy tests.
|
||||
You can find additional accuracy eval examples in:
|
||||
- [test_eval_accuracy_large.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_eval_accuracy_large.py)
|
||||
- [test_gpt_oss_1gpu.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_gpt_oss_1gpu.py)
|
||||
|
||||
## Benchmark the speed
|
||||
Refer to [Benchmark and Profiling](../developer_guide/benchmark_and_profiling.md).
|
||||
|
||||
## Request a review
|
||||
You can identify potential reviewers for your code by checking the [code owners](https://github.com/sgl-project/sglang/blob/main/.github/CODEOWNERS) and [reviewers](https://github.com/sgl-project/sglang/blob/main/.github/REVIEWERS.md) files.
|
||||
Another effective strategy is to review the file modification history and contact individuals who have frequently edited the files.
|
||||
If you modify files protected by code owners, their approval is required to merge the code.
|
||||
|
||||
## General code style
|
||||
- Avoid code duplication. If the same code snippet (more than five lines) appears multiple times, extract it into a shared function.
|
||||
- Minimize device synchronization. Reduce expensive CPU-GPU synchronization operations, such as `tensor.item()` or `tensor.cpu()`, whenever possible. Use vectorized code.
|
||||
- Keep files concise. If a file exceeds 2,000 lines of code, split it into multiple smaller files.
|
||||
- Prioritize extreme efficiency. SGLang is a runtime, and most of your code runs on the critical path for every request. Optimize all minor overheads as much as possible, especially in the model forward code.
|
||||
- A common pattern is some runtime checks in the model forward pass (e.g., [this](https://github.com/sgl-project/sglang/blob/f1b0eda55c2c4838e8ab90a0fac7fb1e3d7064ab/python/sglang/srt/models/deepseek_v2.py#L486-L491)). These are very likely the same for every layer. Please cache the result as a single boolean value whenever possible.
|
||||
- Strive to make functions as pure as possible. Avoid in-place modification of arguments.
|
||||
- When supporting new hardware or features, follow these guidelines:
|
||||
- Do not drastically change existing code.
|
||||
- Always prefer new files to introduce specific components for your new hardware (e.g., `allocator_ascend.py`).
|
||||
- If you write multiple if/else blocks for new features, ensure the common path (e.g., NVIDIA hardware or the existing code path) is the first branch.
|
||||
|
||||
## How to update sgl-kernel
|
||||
Since sglang and sgl-kernel are separate Python packages, our current GitHub CI infrastructure does not support updating a kernel and using it immediately within the same pull request (PR).
|
||||
To add a new kernel or modify an existing one in the sgl-kernel package, you must use multiple PRs.
|
||||
|
||||
Follow these steps:
|
||||
|
||||
1. Submit a PR to update the sgl-kernel source code without using it in sglang python package (e.g., [#8884](https://github.com/sgl-project/sglang/pull/8884/files)).
|
||||
2. Bump the version of sgl-kernel (e.g., [#9220](https://github.com/sgl-project/sglang/pull/9220/files)).
|
||||
- Once merged, this will trigger an automatic release of the sgl-kernel wheel to PyPI.
|
||||
- If not urgent, you can wait for other people to release the wheel. A new version will typically be released within one week.
|
||||
3. Apply the changes:
|
||||
- Update the sgl-kernel version in `sglang/python/pyproject.toml` to use the modified kernels.
|
||||
- Update the related caller code in the sglang to use the new kernel.
|
||||
|
||||
## Tips for newcomers
|
||||
|
||||
If you want to contribute but don’t have a specific idea in mind, pick issues labeled [“good first issue” or “help wanted”](https://github.com/sgl-project/sglang/issues?q=is%3Aissue+label%3A%22good+first+issue%22%2C%22help+wanted%22). These tasks typically have lower complexity and provide an excellent introduction to the codebase. Also check out this [code walk-through](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/tree/main/sglang/code-walk-through) for a deeper look into SGLang’s workflow.
|
||||
|
||||
If you have any questions or want to start a discussion, please feel free to ask in our [Slack channel](https://slack.sglang.ai).
|
||||
|
||||
Thank you for your interest in SGLang. Happy coding!
|
||||
108
docs/developer_guide/development_guide_using_docker.md
Normal file
108
docs/developer_guide/development_guide_using_docker.md
Normal file
@@ -0,0 +1,108 @@
|
||||
# Development Guide Using Docker
|
||||
|
||||
## Setup VSCode on a Remote Host
|
||||
(Optional - you can skip this step if you plan to run sglang dev container locally)
|
||||
|
||||
1. On the remote host, download `code` from [https://code.visualstudio.com/docs/?dv=linux64cli](https://code.visualstudio.com/download) and run `code tunnel` in a shell.
|
||||
|
||||
Example
|
||||
```bash
|
||||
wget https://vscode.download.prss.microsoft.com/dbazure/download/stable/fabdb6a30b49f79a7aba0f2ad9df9b399473380f/vscode_cli_alpine_x64_cli.tar.gz
|
||||
tar xf vscode_cli_alpine_x64_cli.tar.gz
|
||||
|
||||
# https://code.visualstudio.com/docs/remote/tunnels
|
||||
./code tunnel
|
||||
```
|
||||
|
||||
2. In your local machine, press F1 in VSCode and choose "Remote Tunnels: Connect to Tunnel".
|
||||
|
||||
## Setup Docker Container
|
||||
|
||||
### Option 1. Use the default dev container automatically from VSCode
|
||||
There is a `.devcontainer` folder in the sglang repository root folder to allow VSCode to automatically start up within dev container. You can read more about this VSCode extension in VSCode official document [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers).
|
||||

|
||||
(*Figure 1: Diagram from VSCode official documentation [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers).*)
|
||||
|
||||
To enable this, you only need to:
|
||||
1. Start Visual Studio Code and install [VSCode dev container extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers).
|
||||
2. Press F1, type and choose "Dev Container: Open Folder in Container".
|
||||
3. Input the `sglang` local repo path in your machine and press enter.
|
||||
|
||||
The first time you open it in the dev container might take longer due to the docker pull and build. Once it succeeds, you should see the status bar at the bottom left displaying that you are in a dev container:
|
||||
|
||||

|
||||
|
||||
Now when you run `sglang.launch_server` in the VSCode terminal or start debugging using F5, sglang server will be started in the dev container with all your local changes applied automatically:
|
||||
|
||||

|
||||
|
||||
|
||||
### Option 2. Start up containers manually (advanced)
|
||||
|
||||
The following startup command is an example for internal development by the SGLang team. You can **modify or add directory mappings as needed**, especially for model weight downloads, to prevent repeated downloads by different Docker containers.
|
||||
|
||||
❗️ **Note on RDMA**
|
||||
|
||||
1. `--network host` and `--privileged` are required by RDMA. If you don't need RDMA, you can remove them, but keeping them does no harm. Thus, we enable these two flags by default in the commands below.
|
||||
2. You may need to set `NCCL_IB_GID_INDEX` if you are using RoCE, for example: `export NCCL_IB_GID_INDEX=3`.
|
||||
|
||||
```bash
|
||||
# Change the name to yours
|
||||
docker run -itd --shm-size 32g --gpus all -v <volumes-to-mount> --ipc=host --network=host --privileged --name sglang_dev lmsysorg/sglang:dev /bin/zsh
|
||||
docker exec -it sglang_dev /bin/zsh
|
||||
```
|
||||
Some useful volumes to mount are:
|
||||
1. **Huggingface model cache**: mounting model cache can avoid re-download every time docker restarts. Default location on Linux is `~/.cache/huggingface/`.
|
||||
2. **SGLang repository**: code changes in the SGLang local repository will be automatically synced to the .devcontainer.
|
||||
|
||||
Example 1: Mounting the local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes to the devcontainer.
|
||||
```bash
|
||||
docker run -itd --shm-size 32g --gpus all -v /opt/dlami/nvme/.cache:/root/.cache --ipc=host --network=host --privileged --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh
|
||||
docker exec -it sglang_zhyncs /bin/zsh
|
||||
```
|
||||
Example 2: Mounting both HuggingFace cache and local SGLang repo. Local code changes are automatically synced to the devcontainer as the SGLang is installed in editable mode in the dev image.
|
||||
```bash
|
||||
docker run -itd --shm-size 32g --gpus all -v $HOME/.cache/huggingface/:/root/.cache/huggingface -v $HOME/src/sglang:/sgl-workspace/sglang --ipc=host --network=host --privileged --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh
|
||||
docker exec -it sglang_zhyncs /bin/zsh
|
||||
```
|
||||
## Debug SGLang with VSCode Debugger
|
||||
1. (Create if not exist) open `launch.json` in VSCode.
|
||||
2. Add the following config and save. Please note that you can edit the script as needed to apply different parameters or debug a different program (e.g. benchmark script).
|
||||
```JSON
|
||||
{
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Python Debugger: launch_server",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "sglang.launch_server",
|
||||
"console": "integratedTerminal",
|
||||
"args": [
|
||||
"--model-path", "meta-llama/Llama-3.2-1B",
|
||||
"--host", "0.0.0.0",
|
||||
"--port", "30000",
|
||||
"--trust-remote-code",
|
||||
],
|
||||
"justMyCode": false
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
3. Press "F5" to start. VSCode debugger will ensure that the program will pause at the breakpoints even if the program is running at remote SSH/Tunnel host + dev container.
|
||||
|
||||
## Profile
|
||||
|
||||
```bash
|
||||
# Change batch size, input, output and add `disable-cuda-graph` (for easier analysis)
|
||||
# e.g. DeepSeek V3
|
||||
nsys profile -o deepseek_v3 python3 -m sglang.bench_one_batch --batch-size 1 --input 128 --output 256 --model deepseek-ai/DeepSeek-V3 --trust-remote-code --tp 8 --disable-cuda-graph
|
||||
```
|
||||
|
||||
## Evaluation
|
||||
|
||||
```bash
|
||||
# e.g. gsm8k 8 shot
|
||||
python3 benchmark/gsm8k/bench_sglang.py --num-questions 2000 --parallel 2000 --num-shots 8
|
||||
```
|
||||
18
docs/developer_guide/release_process.md
Normal file
18
docs/developer_guide/release_process.md
Normal file
@@ -0,0 +1,18 @@
|
||||
# PyPI Package Release Process
|
||||
|
||||
## Update the version in code
|
||||
Update the package version in `python/pyproject.toml` and `python/sglang/__init__.py`.
|
||||
|
||||
## Upload the PyPI package
|
||||
|
||||
```
|
||||
pip install build twine
|
||||
```
|
||||
|
||||
```
|
||||
cd python
|
||||
bash upload_pypi.sh
|
||||
```
|
||||
|
||||
## Make a release in GitHub
|
||||
Make a new release https://github.com/sgl-project/sglang/releases/new.
|
||||
49
docs/developer_guide/setup_github_runner.md
Normal file
49
docs/developer_guide/setup_github_runner.md
Normal file
@@ -0,0 +1,49 @@
|
||||
# Set Up Self-Hosted Runners for GitHub Action
|
||||
|
||||
## Add a Runner
|
||||
|
||||
### Step 1: Start a docker container.
|
||||
|
||||
You can mount a folder for the shared huggingface model weights cache. The command below uses `/tmp/huggingface` as an example.
|
||||
|
||||
```
|
||||
docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04
|
||||
# Nvidia
|
||||
docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash
|
||||
# AMD
|
||||
docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.5.0rc1-rocm630 /bin/bash
|
||||
# AMD just the last 2 GPUs
|
||||
docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.5.0rc1-rocm630 /bin/bash
|
||||
```
|
||||
|
||||
### Step 2: Configure the runner by `config.sh`
|
||||
|
||||
Run these commands inside the container.
|
||||
|
||||
```
|
||||
apt update && apt install -y curl python3-pip git
|
||||
export RUNNER_ALLOW_RUNASROOT=1
|
||||
```
|
||||
|
||||
Then follow https://github.com/sgl-project/sglang/settings/actions/runners/new?arch=x64&os=linux to run `config.sh`
|
||||
|
||||
**Notes**
|
||||
- Do not need to specify the runner group
|
||||
- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be edited later in Github Settings.
|
||||
- Do not need to change the work folder.
|
||||
|
||||
### Step 3: Run the runner by `run.sh`
|
||||
|
||||
- Set up environment variables
|
||||
```
|
||||
export HF_HOME=/hf_home
|
||||
export SGLANG_IS_IN_CI=true
|
||||
export HF_TOKEN=hf_xxx
|
||||
export OPENAI_API_KEY=sk-xxx
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
```
|
||||
|
||||
- Run it forever
|
||||
```
|
||||
while true; do ./run.sh; echo "Restarting..."; sleep 2; done
|
||||
```
|
||||
131
docs/get_started/install.md
Normal file
131
docs/get_started/install.md
Normal file
@@ -0,0 +1,131 @@
|
||||
# Install SGLang
|
||||
|
||||
You can install SGLang using one of the methods below.
|
||||
|
||||
This page primarily applies to common NVIDIA GPU platforms.
|
||||
For other or newer platforms, please refer to the dedicated pages for [NVIDIA Blackwell GPUs](../platforms/blackwell_gpu.md), [AMD GPUs](../platforms/amd_gpu.md), [Intel Xeon CPUs](../platforms/cpu_server.md), [NVIDIA Jetson](../platforms/nvidia_jetson.md), [Ascend NPUs](../platforms/ascend_npu.md).
|
||||
|
||||
## Method 1: With pip or uv
|
||||
|
||||
It is recommended to use uv for faster installation:
|
||||
|
||||
```bash
|
||||
pip install --upgrade pip
|
||||
pip install uv
|
||||
uv pip install "sglang[all]>=0.5.2"
|
||||
```
|
||||
|
||||
**Quick fixes to common problems**
|
||||
- If you encounter `OSError: CUDA_HOME environment variable is not set`. Please set it to your CUDA install root with either of the following solutions:
|
||||
1. Use `export CUDA_HOME=/usr/local/cuda-<your-cuda-version>` to set the `CUDA_HOME` environment variable.
|
||||
2. Install FlashInfer first following [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html), then install SGLang as described above.
|
||||
|
||||
## Method 2: From source
|
||||
|
||||
```bash
|
||||
# Use the last release branch
|
||||
git clone -b v0.5.2 https://github.com/sgl-project/sglang.git
|
||||
cd sglang
|
||||
|
||||
# Install the python packages
|
||||
pip install --upgrade pip
|
||||
pip install -e "python[all]"
|
||||
```
|
||||
|
||||
**Quick fixes to common problems**
|
||||
- If you want to develop SGLang, it is recommended to use docker. Please refer to [setup docker container](../developer_guide/development_guide_using_docker.md#setup-docker-container). The docker image is `lmsysorg/sglang:dev`.
|
||||
|
||||
## Method 3: Using docker
|
||||
|
||||
The docker images are available on Docker Hub at [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
|
||||
Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
|
||||
|
||||
```bash
|
||||
docker run --gpus all \
|
||||
--shm-size 32g \
|
||||
-p 30000:30000 \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HF_TOKEN=<secret>" \
|
||||
--ipc=host \
|
||||
lmsysorg/sglang:latest \
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
|
||||
```
|
||||
|
||||
## Method 4: Using Kubernetes
|
||||
|
||||
Please check out [OME](https://github.com/sgl-project/ome), a Kubernetes operator for enterprise-grade management and serving of large language models (LLMs).
|
||||
|
||||
<details>
|
||||
<summary>More</summary>
|
||||
|
||||
1. Option 1: For single node serving (typically when the model size fits into GPUs on one node)
|
||||
|
||||
Execute command `kubectl apply -f docker/k8s-sglang-service.yaml`, to create k8s deployment and service, with llama-31-8b as example.
|
||||
|
||||
2. Option 2: For multi-node serving (usually when a large model requires more than one GPU node, such as `DeepSeek-R1`)
|
||||
|
||||
Modify the LLM model path and arguments as necessary, then execute the command `kubectl apply -f docker/k8s-sglang-distributed-sts.yaml` to create a two-node k8s statefulset and serving service.
|
||||
|
||||
</details>
|
||||
|
||||
## Method 5: Using docker compose
|
||||
|
||||
<details>
|
||||
<summary>More</summary>
|
||||
|
||||
> This method is recommended if you plan to serve it as a service.
|
||||
> A better approach is to use the [k8s-sglang-service.yaml](https://github.com/sgl-project/sglang/blob/main/docker/k8s-sglang-service.yaml).
|
||||
|
||||
1. Copy the [compose.yml](https://github.com/sgl-project/sglang/blob/main/docker/compose.yaml) to your local machine
|
||||
2. Execute the command `docker compose up -d` in your terminal.
|
||||
</details>
|
||||
|
||||
## Method 6: Run on Kubernetes or Clouds with SkyPilot
|
||||
|
||||
<details>
|
||||
<summary>More</summary>
|
||||
|
||||
To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
|
||||
|
||||
1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
|
||||
2. Deploy on your own infra with a single command and get the HTTP API endpoint:
|
||||
<details>
|
||||
<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
|
||||
|
||||
```yaml
|
||||
# sglang.yaml
|
||||
envs:
|
||||
HF_TOKEN: null
|
||||
|
||||
resources:
|
||||
image_id: docker:lmsysorg/sglang:latest
|
||||
accelerators: A100
|
||||
ports: 30000
|
||||
|
||||
run: |
|
||||
conda deactivate
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path meta-llama/Llama-3.1-8B-Instruct \
|
||||
--host 0.0.0.0 \
|
||||
--port 30000
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
```bash
|
||||
# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
|
||||
HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
|
||||
|
||||
# Get the HTTP API endpoint
|
||||
sky status --endpoint 30000 sglang
|
||||
```
|
||||
|
||||
3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
|
||||
</details>
|
||||
|
||||
## Common Notes
|
||||
|
||||
- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is the default attention kernel backend. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), please switch to other kernels by adding `--attention-backend triton --sampling-backend pytorch` and open an issue on GitHub.
|
||||
- To reinstall flashinfer locally, use the following command: `pip3 install --upgrade flashinfer-python --force-reinstall --no-deps` and then delete the cache with `rm -rf ~/.cache/flashinfer`.
|
||||
- If you only need to use OpenAI API models with the frontend language, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
|
||||
- The language frontend operates independently of the backend runtime. You can install the frontend locally without needing a GPU, while the backend can be set up on a GPU-enabled machine. To install the frontend, run `pip install sglang`, and for the backend, use `pip install sglang[srt]`. `srt` is the abbreviation of SGLang runtime.
|
||||
95
docs/index.rst
Normal file
95
docs/index.rst
Normal file
@@ -0,0 +1,95 @@
|
||||
SGLang Documentation
|
||||
====================
|
||||
|
||||
SGLang is a fast serving framework for large language models and vision language models.
|
||||
It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
|
||||
The core features include:
|
||||
|
||||
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
|
||||
- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
|
||||
- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
|
||||
- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Get Started
|
||||
|
||||
get_started/install.md
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Basic Usage
|
||||
|
||||
basic_usage/send_request.ipynb
|
||||
basic_usage/openai_api.rst
|
||||
basic_usage/offline_engine_api.ipynb
|
||||
basic_usage/native_api.ipynb
|
||||
basic_usage/sampling_params.md
|
||||
basic_usage/deepseek.md
|
||||
basic_usage/gpt_oss.md
|
||||
basic_usage/llama4.md
|
||||
basic_usage/qwen3.md
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Advanced Features
|
||||
|
||||
advanced_features/server_arguments.md
|
||||
advanced_features/hyperparameter_tuning.md
|
||||
advanced_features/speculative_decoding.ipynb
|
||||
advanced_features/structured_outputs.ipynb
|
||||
advanced_features/structured_outputs_for_reasoning_models.ipynb
|
||||
advanced_features/tool_parser.ipynb
|
||||
advanced_features/separate_reasoning.ipynb
|
||||
advanced_features/quantization.md
|
||||
advanced_features/lora.ipynb
|
||||
advanced_features/pd_disaggregation.md
|
||||
advanced_features/vlm_query.ipynb
|
||||
advanced_features/router.md
|
||||
advanced_features/observability.md
|
||||
advanced_features/attention_backend.md
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Supported Models
|
||||
|
||||
supported_models/generative_models.md
|
||||
supported_models/multimodal_language_models.md
|
||||
supported_models/embedding_models.md
|
||||
supported_models/reward_models.md
|
||||
supported_models/rerank_models.md
|
||||
supported_models/support_new_models.md
|
||||
supported_models/transformers_fallback.md
|
||||
supported_models/modelscope.md
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Hardware Platforms
|
||||
|
||||
platforms/amd_gpu.md
|
||||
platforms/blackwell_gpu.md
|
||||
platforms/cpu_server.md
|
||||
platforms/tpu.md
|
||||
platforms/nvidia_jetson.md
|
||||
platforms/ascend_npu.md
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Developer Guide
|
||||
|
||||
developer_guide/contribution_guide.md
|
||||
developer_guide/development_guide_using_docker.md
|
||||
developer_guide/benchmark_and_profiling.md
|
||||
developer_guide/bench_serving.md
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: References
|
||||
|
||||
references/faq.md
|
||||
references/environment_variables.md
|
||||
references/production_metrics.md
|
||||
references/multi_node_deployment/multi_node_index.rst
|
||||
references/custom_chat_template.md
|
||||
references/frontend/frontend_index.rst
|
||||
references/learn_more.md
|
||||
158
docs/platforms/amd_gpu.md
Normal file
158
docs/platforms/amd_gpu.md
Normal file
@@ -0,0 +1,158 @@
|
||||
# AMD GPUs
|
||||
|
||||
This document describes how to run SGLang on AMD GPUs. If you encounter issues or have questions, please [open an issue](https://github.com/sgl-project/sglang/issues).
|
||||
|
||||
## System Configuration
|
||||
|
||||
When using AMD GPUs (such as MI300X), certain system-level optimizations help ensure stable performance. Here we take MI300X as an example. AMD provides official documentation for MI300X optimization and system tuning:
|
||||
|
||||
- [AMD MI300X Tuning Guides](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html)
|
||||
- [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference/vllm-benchmark.html)
|
||||
- [AMD Instinct MI300X System Optimization](https://rocm.docs.amd.com/en/latest/how-to/system-optimization/mi300x.html)
|
||||
- [AMD Instinct MI300X Workload Optimization](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference-optimization/workload.html)
|
||||
- [Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html)
|
||||
|
||||
**NOTE:** We strongly recommend reading these docs and guides entirely to fully utilize your system.
|
||||
|
||||
Below are a few key settings to confirm or enable for SGLang:
|
||||
|
||||
### Update GRUB Settings
|
||||
|
||||
In `/etc/default/grub`, append the following to `GRUB_CMDLINE_LINUX`:
|
||||
|
||||
```text
|
||||
pci=realloc=off iommu=pt
|
||||
```
|
||||
|
||||
Afterward, run `sudo update-grub` (or your distro’s equivalent) and reboot.
|
||||
|
||||
### Disable NUMA Auto-Balancing
|
||||
|
||||
```bash
|
||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||
```
|
||||
|
||||
You can automate or verify this change using [this helpful script](https://github.com/ROCm/triton/blob/rocm_env/scripts/amd/env_check.sh).
|
||||
|
||||
Again, please go through the entire documentation to confirm your system is using the recommended configuration.
|
||||
|
||||
## Install SGLang
|
||||
|
||||
You can install SGLang using one of the methods below.
|
||||
|
||||
### Install from Source
|
||||
|
||||
```bash
|
||||
# Use the last release branch
|
||||
git clone -b v0.5.2 https://github.com/sgl-project/sglang.git
|
||||
cd sglang
|
||||
|
||||
# Compile sgl-kernel
|
||||
pip install --upgrade pip
|
||||
cd sgl-kernel
|
||||
python setup_rocm.py install
|
||||
|
||||
# Install sglang python package
|
||||
cd ..
|
||||
pip install -e "python[all_hip]"
|
||||
```
|
||||
|
||||
### Install Using Docker (Recommended)
|
||||
|
||||
The docker images are available on Docker Hub at [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile.rocm](https://github.com/sgl-project/sglang/tree/main/docker).
|
||||
|
||||
The steps below show how to build and use an image.
|
||||
|
||||
1. Build the docker image.
|
||||
If you use pre-built images, you can skip this step and replace `sglang_image` with the pre-built image names in the steps below.
|
||||
|
||||
```bash
|
||||
docker build -t sglang_image -f Dockerfile.rocm .
|
||||
```
|
||||
|
||||
2. Create a convenient alias.
|
||||
|
||||
```bash
|
||||
alias drun='docker run -it --rm --network=host --privileged --device=/dev/kfd --device=/dev/dri \
|
||||
--ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
-v $HOME/dockerx:/dockerx \
|
||||
-v /data:/data'
|
||||
```
|
||||
|
||||
If you are using RDMA, please note that:
|
||||
- `--network host` and `--privileged` are required by RDMA. If you don't need RDMA, you can remove them.
|
||||
- You may need to set `NCCL_IB_GID_INDEX` if you are using RoCE, for example: `export NCCL_IB_GID_INDEX=3`.
|
||||
|
||||
3. Launch the server.
|
||||
|
||||
**NOTE:** Replace `<secret>` below with your [huggingface hub token](https://huggingface.co/docs/hub/en/security-tokens).
|
||||
|
||||
```bash
|
||||
drun -p 30000:30000 \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HF_TOKEN=<secret>" \
|
||||
sglang_image \
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path NousResearch/Meta-Llama-3.1-8B \
|
||||
--host 0.0.0.0 \
|
||||
--port 30000
|
||||
```
|
||||
|
||||
4. To verify the utility, you can run a benchmark in another terminal or refer to [other docs](https://docs.sglang.ai/backend/openai_api_completions.html) to send requests to the engine.
|
||||
|
||||
```bash
|
||||
drun sglang_image \
|
||||
python3 -m sglang.bench_serving \
|
||||
--backend sglang \
|
||||
--dataset-name random \
|
||||
--num-prompts 4000 \
|
||||
--random-input 128 \
|
||||
--random-output 128
|
||||
```
|
||||
|
||||
With your AMD system properly configured and SGLang installed, you can now fully leverage AMD hardware to power SGLang’s machine learning capabilities.
|
||||
|
||||
## Examples
|
||||
|
||||
### Running DeepSeek-V3
|
||||
|
||||
The only difference when running DeepSeek-V3 is in how you start the server. Here's an example command:
|
||||
|
||||
```bash
|
||||
drun -p 30000:30000 \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--ipc=host \
|
||||
--env "HF_TOKEN=<secret>" \
|
||||
sglang_image \
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path deepseek-ai/DeepSeek-V3 \ # <- here
|
||||
--tp 8 \
|
||||
--trust-remote-code \
|
||||
--host 0.0.0.0 \
|
||||
--port 30000
|
||||
```
|
||||
|
||||
[Running DeepSeek-R1 on a single NDv5 MI300X VM](https://techcommunity.microsoft.com/blog/azurehighperformancecomputingblog/running-deepseek-r1-on-a-single-ndv5-mi300x-vm/4372726) could also be a good reference.
|
||||
|
||||
### Running Llama3.1
|
||||
|
||||
Running Llama3.1 is nearly identical to running DeepSeek-V3. The only difference is in the model specified when starting the server, shown by the following example command:
|
||||
|
||||
```bash
|
||||
drun -p 30000:30000 \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--ipc=host \
|
||||
--env "HF_TOKEN=<secret>" \
|
||||
sglang_image \
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path meta-llama/Meta-Llama-3.1-8B-Instruct \ # <- here
|
||||
--tp 8 \
|
||||
--trust-remote-code \
|
||||
--host 0.0.0.0 \
|
||||
--port 30000
|
||||
```
|
||||
|
||||
### Warmup Step
|
||||
|
||||
When the server displays `The server is fired up and ready to roll!`, it means the startup is successful.
|
||||
206
docs/platforms/ascend_npu.md
Normal file
206
docs/platforms/ascend_npu.md
Normal file
@@ -0,0 +1,206 @@
|
||||
# Ascend NPUs
|
||||
|
||||
You can install SGLang using any of the methods below. Please go through the `System Settings` section to ensure the clusters are roaring at max performance. Feel free to leave an issue [here at sglang](https://github.com/sgl-project/sglang/issues) if you encounter any issues.
|
||||
|
||||
## System Settings
|
||||
|
||||
### CPU performance power scheme
|
||||
|
||||
The default power scheme on Ascend hardware is `ondemand`, which could affect performance; changing it to `performance` is recommended.
|
||||
|
||||
```shell
|
||||
echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
|
||||
|
||||
# Make sure changes are applied successfully
|
||||
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor # shows performance
|
||||
```
|
||||
|
||||
### Disable NUMA balancing
|
||||
|
||||
```shell
|
||||
sudo sysctl -w kernel.numa_balancing=0
|
||||
|
||||
# Check
|
||||
cat /proc/sys/kernel/numa_balancing # shows 0
|
||||
```
|
||||
|
||||
### Prevent swapping out system memory
|
||||
|
||||
```shell
|
||||
sudo sysctl -w vm.swappiness=10
|
||||
|
||||
# Check
|
||||
cat /proc/sys/vm/swappiness # shows 10
|
||||
```
|
||||
|
||||
## Installing SGLang
|
||||
|
||||
### Method 1: Installing from source with prerequisites
|
||||
|
||||
#### Python Version
|
||||
|
||||
Only `python==3.11` is supported currently. If you don't want to break system pre-installed python, try installing with [conda](https://github.com/conda/conda).
|
||||
|
||||
```shell
|
||||
conda create --name sglang_npu python=3.11
|
||||
conda activate sglang_npu
|
||||
```
|
||||
|
||||
#### MemFabric Adaptor
|
||||
|
||||
_TODO: MemFabric is still a work-in-progress project, not yet open sourced until August/September, 2025. We will release it as a prebuilt wheel package for now._
|
||||
|
||||
_Notice: Prebuilt wheel package is based on `aarch64`, please leave an issue [here at sglang](https://github.com/sgl-project/sglang/issues) to let us know the requests for `amd64` build._
|
||||
|
||||
MemFabric Adaptor is a drop-in replacement of Mooncake Transfer Engine that enables KV cache transfer on Ascend NPU clusters.
|
||||
|
||||
```shell
|
||||
MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
|
||||
MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${MF_WHL_NAME}"
|
||||
wget -O "${MF_WHL_NAME}" "${MEMFABRIC_URL}" && pip install "./${MF_WHL_NAME}"
|
||||
```
|
||||
|
||||
#### Pytorch and Pytorch Framework Adaptor on Ascend
|
||||
|
||||
Only `torch==2.6.0` is supported currently due to NPUgraph and Triton-on-Ascend's limitation, however a more generalized version will be release by the end of September, 2025.
|
||||
|
||||
```shell
|
||||
PYTORCH_VERSION=2.6.0
|
||||
TORCHVISION_VERSION=0.21.0
|
||||
pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
|
||||
|
||||
PTA_VERSION="v7.1.0.1-pytorch2.6.0"
|
||||
PTA_NAME="torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl"
|
||||
PTA_URL="https://gitee.com/ascend/pytorch/releases/download/${PTA_VERSION}/${PTA_NAME}"
|
||||
wget -O "${PTA_NAME}" "${PTA_URL}" && pip install "./${PTA_NAME}"
|
||||
```
|
||||
|
||||
#### vLLM
|
||||
|
||||
vLLM is still a major prerequisite on Ascend NPU. Because of `torch==2.6.0` limitation, only vLLM v0.8.5 is supported.
|
||||
|
||||
```shell
|
||||
VLLM_TAG=v0.8.5
|
||||
git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG
|
||||
(cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v -e .)
|
||||
```
|
||||
|
||||
#### Triton on Ascend
|
||||
|
||||
_Notice:_ We recommend installing triton-ascend from source due to its rapid development; the version on PyPI can't keep up for now. This problem will be solved in Sep. 2025, after which `pip install` will be the one and only installation method.
|
||||
|
||||
Please follow Triton-on-Ascend's [installation guide from source](https://gitee.com/ascend/triton-ascend#2%E6%BA%90%E4%BB%A3%E7%A0%81%E5%AE%89%E8%A3%85-triton-ascend) to install the latest `triton-ascend` package.
|
||||
|
||||
#### DeepEP-compatible Library
|
||||
|
||||
We are also providing a DeepEP-compatible Library as a drop-in replacement of deepseek-ai's DeepEP library, check the [installation guide](https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/deep_ep/README.md).
|
||||
|
||||
#### Installing SGLang from source
|
||||
|
||||
```shell
|
||||
# Use the last release branch
|
||||
git clone -b v0.5.2 https://github.com/sgl-project/sglang.git
|
||||
cd sglang
|
||||
|
||||
pip install --upgrade pip
|
||||
pip install -e python[srt_npu]
|
||||
```
|
||||
|
||||
### Method 2: Using docker
|
||||
|
||||
__Notice:__ `--privileged` and `--network=host` are required by RDMA, which is typically needed by Ascend NPU clusters.
|
||||
|
||||
__Notice:__ The following docker command is based on Atlas 800I A3 machines. If you are using Atlas 800I A2, make sure only `davinci[0-7]` are mapped into container.
|
||||
|
||||
```shell
|
||||
# Clone the SGLang repository
|
||||
git clone https://github.com/sgl-project/sglang.git
|
||||
cd sglang/docker
|
||||
|
||||
# Build the docker image
|
||||
docker build -t sglang-npu:main -f Dockerfile.npu .
|
||||
|
||||
alias drun='docker run -it --rm --privileged --network=host --ipc=host --shm-size=16g \
|
||||
--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 \
|
||||
--device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 \
|
||||
--device=/dev/davinci8 --device=/dev/davinci9 --device=/dev/davinci10 --device=/dev/davinci11 \
|
||||
--device=/dev/davinci12 --device=/dev/davinci13 --device=/dev/davinci14 --device=/dev/davinci15 \
|
||||
--device=/dev/davinci_manager --device=/dev/hisi_hdc \
|
||||
--volume /usr/local/sbin:/usr/local/sbin --volume /usr/local/Ascend/driver:/usr/local/Ascend/driver \
|
||||
--volume /usr/local/Ascend/firmware:/usr/local/Ascend/firmware \
|
||||
--volume /etc/ascend_install.info:/etc/ascend_install.info \
|
||||
--volume /var/queue_schedule:/var/queue_schedule --volume ~/.cache/:/root/.cache/'
|
||||
|
||||
drun --env "HF_TOKEN=<secret>" \
|
||||
sglang-npu:main \
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --attention-backend ascend --host 0.0.0.0 --port 30000
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Running DeepSeek-V3
|
||||
|
||||
Running DeepSeek with PD disaggregation on 2 x Atlas 800I A3.
|
||||
Model weights could be found [here](https://modelers.cn/models/State_Cloud/Deepseek-R1-bf16-hfd-w8a8).
|
||||
|
||||
Prefill:
|
||||
|
||||
```shell
|
||||
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
|
||||
export ASCEND_MF_STORE_URL="tcp://<PREFILL_HOST_IP>:<PORT>"
|
||||
|
||||
drun sglang-npu:main \
|
||||
python3 -m sglang.launch_server --model-path State_Cloud/DeepSeek-R1-bf16-hfd-w8a8 \
|
||||
--trust-remote-code \
|
||||
--attention-backend ascend \
|
||||
--mem-fraction-static 0.8 \
|
||||
--quantization w8a8_int8 \
|
||||
--tp-size 16 \
|
||||
--dp-size 1 \
|
||||
--nnodes 1 \
|
||||
--node-rank 0 \
|
||||
--disaggregation-mode prefill \
|
||||
--disaggregation-bootstrap-port 6657 \
|
||||
--disaggregation-transfer-backend ascend \
|
||||
--dist-init-addr <PREFILL_HOST_IP>:6688 \
|
||||
--host <PREFILL_HOST_IP> \
|
||||
--port 8000
|
||||
```
|
||||
|
||||
Decode:
|
||||
|
||||
```shell
|
||||
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
|
||||
export ASCEND_MF_STORE_URL="tcp://<PREFILL_HOST_IP>:<PORT>"
|
||||
export HCCL_BUFFSIZE=200
|
||||
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=24
|
||||
|
||||
drun sglang-npu:main \
|
||||
python3 -m sglang.launch_server --model-path State_Cloud/DeepSeek-R1-bf16-hfd-w8a8 \
|
||||
--trust-remote-code \
|
||||
--attention-backend ascend \
|
||||
--mem-fraction-static 0.8 \
|
||||
--quantization w8a8_int8 \
|
||||
--enable-deepep-moe \
|
||||
--deepep-mode low_latency \
|
||||
--tp-size 16 \
|
||||
--dp-size 1 \
|
||||
--ep-size 16 \
|
||||
--nnodes 1 \
|
||||
--node-rank 0 \
|
||||
--disaggregation-mode decode \
|
||||
--disaggregation-transfer-backend ascend \
|
||||
--dist-init-addr <DECODE_HOST_IP>:6688 \
|
||||
--host <DECODE_HOST_IP> \
|
||||
--port 8001
|
||||
```
|
||||
|
||||
Mini_LB:
|
||||
|
||||
```shell
|
||||
drun sglang-npu:main \
|
||||
python -m sglang.srt.disaggregation.launch_lb \
|
||||
--prefill http://<PREFILL_HOST_IP>:8000 \
|
||||
--decode http://<DECODE_HOST_IP>:8001 \
|
||||
--host 127.0.0.1 --port 5000
|
||||
```
|
||||
9
docs/platforms/blackwell_gpu.md
Normal file
9
docs/platforms/blackwell_gpu.md
Normal file
@@ -0,0 +1,9 @@
|
||||
# Blackwell GPUs
|
||||
|
||||
We will release the pre-built wheels soon. Before that, please try to compile from source or check the blackwell docker images from [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
|
||||
|
||||
## B200 with x86 CPUs
|
||||
TODO
|
||||
|
||||
## GB200/GB300 with ARM CPUs
|
||||
TODO
|
||||
204
docs/platforms/cpu_server.md
Normal file
204
docs/platforms/cpu_server.md
Normal file
@@ -0,0 +1,204 @@
|
||||
# CPU Servers
|
||||
|
||||
The document addresses how to set up the [SGLang](https://github.com/sgl-project/sglang) environment and run LLM inference on CPU servers.
|
||||
Specifically, SGLang is well optimized on the CPUs equipped with Intel® AMX® Instructions,
|
||||
which are 4th generation or newer Intel® Xeon® Scalable Processors.
|
||||
|
||||
## Optimized Model List
|
||||
|
||||
A list of popular LLMs is optimized to run efficiently on CPU,
|
||||
including the most notable open-source models like Llama series, Qwen series,
|
||||
and the phenomenal high-quality reasoning model DeepSeek-R1.
|
||||
|
||||
| Model Name | BF16 | w8a8_int8 | FP8 |
|
||||
|:---:|:---:|:---:|:---:|
|
||||
| DeepSeek-R1 | | [meituan/DeepSeek-R1-Channel-INT8](https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8) | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
|
||||
| Llama-3.2-3B | [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | [RedHatAI/Llama-3.2-3B-quantized.w8a8](https://huggingface.co/RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8) | |
|
||||
| Llama-3.1-8B | [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | [RedHatAI/Meta-Llama-3.1-8B-quantized.w8a8](https://huggingface.co/RedHatAI/Meta-Llama-3.1-8B-quantized.w8a8) | |
|
||||
| QwQ-32B | | [RedHatAI/QwQ-32B-quantized.w8a8](https://huggingface.co/RedHatAI/QwQ-32B-quantized.w8a8) | |
|
||||
| DeepSeek-Distilled-Llama | | [RedHatAI/DeepSeek-R1-Distill-Llama-70B-quantized.w8a8](https://huggingface.co/RedHatAI/DeepSeek-R1-Distill-Llama-70B-quantized.w8a8) | |
|
||||
| Qwen3-235B | | | [Qwen/Qwen3-235B-A22B-FP8](https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8) |
|
||||
|
||||
**Note:** The model identifiers listed in the table above
|
||||
have been verified on 6th Gen Intel® Xeon® P-core platforms.
|
||||
|
||||
## Installation
|
||||
|
||||
### Install Using Docker
|
||||
|
||||
It is recommended to use Docker for setting up the SGLang environment.
|
||||
A [Dockerfile](https://github.com/sgl-project/sglang/blob/main/docker/Dockerfile.xeon) is provided to facilitate the installation.
|
||||
Replace `<secret>` below with your [HuggingFace access token](https://huggingface.co/docs/hub/en/security-tokens).
|
||||
|
||||
```bash
|
||||
# Clone the SGLang repository
|
||||
git clone https://github.com/sgl-project/sglang.git
|
||||
cd sglang/docker
|
||||
|
||||
# Build the docker image
|
||||
docker build -t sglang-cpu:main -f Dockerfile.xeon .
|
||||
|
||||
# Initiate a docker container
|
||||
docker run \
|
||||
-it \
|
||||
--privileged \
|
||||
--ipc=host \
|
||||
--network=host \
|
||||
-v /dev/shm:/dev/shm \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
-p 30000:30000 \
|
||||
-e "HF_TOKEN=<secret>" \
|
||||
sglang-cpu:main /bin/bash
|
||||
```
|
||||
|
||||
### Install From Source
|
||||
|
||||
If you'd prefer to install SGLang in a bare metal environment,
|
||||
the commands are listed below.
|
||||
It is worth noting that the environment variable `SGLANG_USE_CPU_ENGINE=1`
|
||||
is required to enable SGLang service with CPU engine.
|
||||
|
||||
```bash
|
||||
# Create and activate a conda environment
|
||||
conda create -n sgl-cpu python=3.12 -y
|
||||
conda activate sgl-cpu
|
||||
|
||||
# Optional: Set PyTorch CPU as primary pip install channel to avoid installing CUDA version
|
||||
pip config set global.index-url https://download.pytorch.org/whl/cpu
|
||||
pip config set global.extra-index-url https://pypi.org/simple
|
||||
|
||||
# Check if some conda related environment variables have been set
|
||||
env | grep -i conda
|
||||
# The following environment variable settings are required
|
||||
# if they have not been set properly
|
||||
export CONDA_EXE=$(which conda)
|
||||
export CONDA_ROOT=${CONDA_EXE}/../..
|
||||
export CONDA_PREFIX=${CONDA_ROOT}/envs/sgl-cpu
|
||||
export PATH=${PATH}:${CONDA_ROOT}/bin:${CONDA_ROOT}/condabin
|
||||
|
||||
# Clone the SGLang code
|
||||
git clone https://github.com/sgl-project/sglang.git
|
||||
cd sglang
|
||||
git checkout <YOUR-DESIRED-VERSION>
|
||||
|
||||
# Install SGLang dependent libs, and build SGLang main package
|
||||
pip install --upgrade pip setuptools
|
||||
conda install -y libsqlite==3.48.0 gperftools tbb libnuma numactl
|
||||
pip install -e "python[all_cpu]"
|
||||
pip install torch==2.7.1 torchvision==0.22.1 triton==3.3.1 --force-reinstall
|
||||
|
||||
# Build the CPU backend kernels
|
||||
cd sgl-kernel
|
||||
cp pyproject_cpu.toml pyproject.toml
|
||||
pip install .
|
||||
|
||||
# Other required environment variables
|
||||
# Recommend to set these in ~/.bashrc in order not to set every time in a new terminal
|
||||
export SGLANG_USE_CPU_ENGINE=1
|
||||
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so:${CONDA_PREFIX}/lib/libtcmalloc.so:${CONDA_PREFIX}/lib/libtbbmalloc.so.2
|
||||
```
|
||||
|
||||
## Launch of the Serving Engine
|
||||
|
||||
Example command to launch SGLang serving:
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server \
|
||||
--model <MODEL_ID_OR_PATH> \
|
||||
--trust-remote-code \
|
||||
--disable-overlap-schedule \
|
||||
--device cpu \
|
||||
--host 0.0.0.0 \
|
||||
--tp 6
|
||||
```
|
||||
|
||||
Notes:
|
||||
|
||||
1. For running W8A8 quantized models, please add the flag `--quantization w8a8_int8`.
|
||||
|
||||
2. The flag `--tp 6` specifies that tensor parallelism will be applied using 6 ranks (TP6).
|
||||
The TP number specified determines how many TP ranks will be used during execution.
|
||||
On a CPU platform, a TP rank corresponds to a sub-NUMA cluster (SNC).
|
||||
Usually we can get the SNC information (how many are available) from the Operating System.
|
||||
Users can specify TP to be no more than the total number of available SNCs in the current system.
|
||||
|
||||
If the specified TP rank number differs from the total SNC count,
|
||||
the system will automatically utilize the first `n` SNCs.
|
||||
Note that `n` cannot exceed the total SNC number, doing so will result in an error.
|
||||
|
||||
To specify the cores to be used, we need to explicitly set the environment variable `SGLANG_CPU_OMP_THREADS_BIND`.
|
||||
For example, if we want to run the SGLang service using the first 40 cores of each SNC on a Xeon® 6980P server,
|
||||
which has 43-43-42 cores on the 3 SNCs of a socket, we should set:
|
||||
|
||||
```bash
|
||||
export SGLANG_CPU_OMP_THREADS_BIND="0-39|43-82|86-125|128-167|171-210|214-253"
|
||||
```
|
||||
|
||||
Please beware that with SGLANG_CPU_OMP_THREADS_BIND set,
|
||||
the available memory amounts of the ranks may not be determined in advance.
|
||||
You may need to set proper `--max-total-tokens` to avoid the out-of-memory error.
|
||||
|
||||
3. For optimizing decoding with torch.compile, please add the flag `--enable-torch-compile`.
|
||||
To specify the maximum batch size when using torch compile, set the flag `--torch-compile-max-bs`.
|
||||
For example, `--enable-torch-compile --torch-compile-max-bs 4` means using torch compile and setting the
|
||||
maximum batch size to 4.
|
||||
|
||||
4. A warmup step is automatically triggered when the service is started.
|
||||
The server is ready when you see the log `The server is fired up and ready to roll!`.
|
||||
|
||||
## Benchmarking with Requests
|
||||
|
||||
You can benchmark the performance via the `bench_serving` script.
|
||||
Run the command in another terminal.
|
||||
|
||||
```bash
|
||||
python -m sglang.bench_serving \
|
||||
--dataset-name random \
|
||||
--random-input-len 1024 \
|
||||
--random-output-len 1024 \
|
||||
--num-prompts 1 \
|
||||
--request-rate inf \
|
||||
--random-range-ratio 1.0
|
||||
```
|
||||
|
||||
Detailed explanations of the parameters can be looked up with the command:
|
||||
|
||||
```bash
|
||||
python -m sglang.bench_serving -h
|
||||
```
|
||||
|
||||
Additionally, the requests can be formed with
|
||||
[OpenAI Completions API](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
|
||||
and sent via the command line (e.g. using `curl`) or via your own script.
|
||||
|
||||
## Example: Running DeepSeek-R1
|
||||
|
||||
An example command to launch the service for W8A8 DeepSeek-R1 on a Xeon® 6980P server is
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server \
|
||||
--model meituan/DeepSeek-R1-Channel-INT8 \
|
||||
--trust-remote-code \
|
||||
--disable-overlap-schedule \
|
||||
--device cpu \
|
||||
--quantization w8a8_int8 \
|
||||
--host 0.0.0.0 \
|
||||
--mem-fraction-static 0.8 \
|
||||
--tp 6
|
||||
```
|
||||
|
||||
Similarly, an example command to launch the service for FP8 DeepSeek-R1 would be
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server \
|
||||
--model deepseek-ai/DeepSeek-R1 \
|
||||
--trust-remote-code \
|
||||
--disable-overlap-schedule \
|
||||
--device cpu \
|
||||
--host 0.0.0.0 \
|
||||
--mem-fraction-static 0.8 \
|
||||
--tp 6
|
||||
```
|
||||
|
||||
Then you can test with the `bench_serving` command, or construct your own command or script
|
||||
following [the benchmarking example](#benchmarking-with-requests).
|
||||
80
docs/platforms/nvidia_jetson.md
Normal file
80
docs/platforms/nvidia_jetson.md
Normal file
@@ -0,0 +1,80 @@
|
||||
# NVIDIA Jetson Orin
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before starting, ensure the following:
|
||||
|
||||
- [**NVIDIA Jetson AGX Orin Devkit**](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/jetson-orin/) is set up with **JetPack 6.1** or later.
|
||||
- **CUDA Toolkit** and **cuDNN** are installed.
|
||||
- Verify that the Jetson AGX Orin is in **high-performance mode**:
|
||||
```bash
|
||||
sudo nvpmodel -m 0
|
||||
```
|
||||
* * * * *
|
||||
## Installing and running SGLang with Jetson Containers
|
||||
Clone the jetson-containers github repository:
|
||||
```
|
||||
git clone https://github.com/dusty-nv/jetson-containers.git
|
||||
```
|
||||
Run the installation script:
|
||||
```
|
||||
bash jetson-containers/install.sh
|
||||
```
|
||||
Build the container image:
|
||||
```
|
||||
jetson-containers build sglang
|
||||
```
|
||||
Run the container:
|
||||
```
|
||||
jetson-containers run $(autotag sglang)
|
||||
```
|
||||
Or you can also manually run a container with this command:
|
||||
```
|
||||
docker run --runtime nvidia -it --rm --network=host IMAGE_NAME
|
||||
```
|
||||
* * * * *
|
||||
|
||||
Running Inference
|
||||
-----------------------------------------
|
||||
|
||||
Launch the server:
|
||||
```bash
|
||||
python -m sglang.launch_server \
|
||||
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
|
||||
--device cuda \
|
||||
--dtype half \
|
||||
--attention-backend flashinfer \
|
||||
--mem-fraction-static 0.8 \
|
||||
--context-length 8192
|
||||
```
|
||||
The quantization and limited context length (`--dtype half --context-length 8192`) are due to the limited computational resources in [Nvidia jetson kit](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/jetson-orin/). A detailed explanation can be found in [Server Arguments](../backend/server_arguments.md).
|
||||
|
||||
After launching the engine, refer to [Chat completions](https://docs.sglang.ai/backend/openai_api_completions.html#Usage) to test the usability.
|
||||
* * * * *
|
||||
Running quantization with TorchAO
|
||||
-------------------------------------
|
||||
TorchAO is recommended for NVIDIA Jetson Orin.
|
||||
```bash
|
||||
python -m sglang.launch_server \
|
||||
--model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||
--device cuda \
|
||||
--dtype bfloat16 \
|
||||
--attention-backend flashinfer \
|
||||
--mem-fraction-static 0.8 \
|
||||
--context-length 8192 \
|
||||
--torchao-config int4wo-128
|
||||
```
|
||||
This enables TorchAO's int4 weight-only quantization with a 128-group size. The usage of `--torchao-config int4wo-128` is also for memory efficiency.
|
||||
|
||||
|
||||
* * * * *
|
||||
Structured output with XGrammar
|
||||
-------------------------------
|
||||
Please refer to [SGLang doc structured output](../advanced_features/structured_outputs.ipynb).
|
||||
* * * * *
|
||||
|
||||
Thanks to the support from [Nurgaliyev Shakhizat](https://github.com/shahizat), [Dustin Franklin](https://github.com/dusty-nv) and [Johnny Núñez Cano](https://github.com/johnnynunez).
|
||||
|
||||
References
|
||||
----------
|
||||
- [NVIDIA Jetson AGX Orin Documentation](https://developer.nvidia.com/embedded/jetson-agx-orin)
|
||||
3
docs/platforms/tpu.md
Normal file
3
docs/platforms/tpu.md
Normal file
@@ -0,0 +1,3 @@
|
||||
# TPU
|
||||
|
||||
The support for TPU is under active development. Please stay tuned.
|
||||
42
docs/references/custom_chat_template.md
Normal file
42
docs/references/custom_chat_template.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# Custom Chat Template
|
||||
|
||||
**NOTE**: There are two chat template systems in SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined at [conversation.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined at [chat_template.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/lang/chat_template.py)).
|
||||
|
||||
By default, the server uses the chat template specified in the model tokenizer from Hugging Face.
|
||||
It should just work for most official models such as Llama-2/Llama-3.
|
||||
|
||||
If needed, you can also override the chat template when launching the server:
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
|
||||
```
|
||||
|
||||
If the chat template you are looking for is missing, you are welcome to contribute it or load it from a file.
|
||||
|
||||
## JSON Format
|
||||
|
||||
You can load the JSON format, which is defined by `conversation.py`.
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "my_model",
|
||||
"system": "<|im_start|>system",
|
||||
"user": "<|im_start|>user",
|
||||
"assistant": "<|im_start|>assistant",
|
||||
"sep_style": "CHATML",
|
||||
"sep": "<|im_end|>",
|
||||
"stop_str": ["<|im_end|>", "<|im_start|>"]
|
||||
}
|
||||
```
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.json
|
||||
```
|
||||
|
||||
## Jinja Format
|
||||
|
||||
You can also use the [Jinja template format](https://huggingface.co/docs/transformers/main/en/chat_templating) as defined by Hugging Face Transformers.
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.jinja
|
||||
```
|
||||
97
docs/references/environment_variables.md
Normal file
97
docs/references/environment_variables.md
Normal file
@@ -0,0 +1,97 @@
|
||||
# Environment Variables
|
||||
|
||||
SGLang supports various environment variables that can be used to configure its runtime behavior. This document provides a comprehensive list and aims to stay updated over time.
|
||||
|
||||
*Note: SGLang uses two prefixes for environment variables: `SGL_` and `SGLANG_`. This is likely due to historical reasons. While both are currently supported for different settings, future versions might consolidate them.*
|
||||
|
||||
## General Configuration
|
||||
|
||||
| Environment Variable | Description | Default Value |
|
||||
| --- | --- | --- |
|
||||
| `SGLANG_USE_MODELSCOPE` | Enable using models from ModelScope | `false` |
|
||||
| `SGLANG_HOST_IP` | Host IP address for the server | `0.0.0.0` |
|
||||
| `SGLANG_PORT` | Port for the server | auto-detected |
|
||||
| `SGLANG_LOGGING_CONFIG_PATH` | Custom logging configuration path | Not set |
|
||||
| `SGLANG_DISABLE_REQUEST_LOGGING` | Disable request logging | `false` |
|
||||
| `SGLANG_HEALTH_CHECK_TIMEOUT` | Timeout for health check in seconds | `20` |
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
| Environment Variable | Description | Default Value |
|
||||
| --- | --- | --- |
|
||||
| `SGLANG_ENABLE_TORCH_INFERENCE_MODE` | Control whether to use torch.inference_mode | `false` |
|
||||
| `SGLANG_ENABLE_TORCH_COMPILE` | Enable torch.compile | `true` |
|
||||
| `SGLANG_SET_CPU_AFFINITY` | Enable CPU affinity setting (often set to `1` in Docker builds) | `0` |
|
||||
| `SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN` | Allows the scheduler to overwrite longer context length requests (often set to `1` in Docker builds) | `0` |
|
||||
| `SGLANG_IS_FLASHINFER_AVAILABLE` | Control FlashInfer availability check | `true` |
|
||||
| `SGLANG_SKIP_P2P_CHECK` | Skip P2P (peer-to-peer) access check | `false` |
|
||||
| `SGL_CHUNKED_PREFIX_CACHE_THRESHOLD` | Sets the threshold for enabling chunked prefix caching | `8192` |
|
||||
| `SGLANG_FUSED_MLA_ENABLE_ROPE_FUSION` | Enable RoPE fusion in Fused Multi-Layer Attention | `1` |
|
||||
|
||||
## DeepGEMM Configuration (Advanced Optimization)
|
||||
|
||||
| Environment Variable | Description | Default Value |
|
||||
| --- | --- | --- |
|
||||
| `SGL_ENABLE_JIT_DEEPGEMM` | Enable Just-In-Time compilation of DeepGEMM kernels | `"true"` |
|
||||
| `SGL_JIT_DEEPGEMM_PRECOMPILE` | Enable precompilation of DeepGEMM kernels | `"true"` |
|
||||
| `SGL_JIT_DEEPGEMM_COMPILE_WORKERS` | Number of workers for parallel DeepGEMM kernel compilation | `4` |
|
||||
| `SGL_IN_DEEPGEMM_PRECOMPILE_STAGE` | Indicator flag used during the DeepGEMM precompile script | `"false"` |
|
||||
| `SGL_DG_CACHE_DIR` | Directory for caching compiled DeepGEMM kernels | `~/.cache/deep_gemm` |
|
||||
| `SGL_DG_USE_NVRTC` | Use NVRTC (instead of Triton) for JIT compilation (Experimental) | `"0"` |
|
||||
| `SGL_USE_DEEPGEMM_BMM` | Use DeepGEMM for Batched Matrix Multiplication (BMM) operations | `"false"` |
|
||||
|
||||
## Memory Management
|
||||
|
||||
| Environment Variable | Description | Default Value |
|
||||
| --- | --- | --- |
|
||||
| `SGLANG_DEBUG_MEMORY_POOL` | Enable memory pool debugging | `false` |
|
||||
| `SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION` | Clip max new tokens estimation for memory planning | `4096` |
|
||||
| `SGLANG_DETOKENIZER_MAX_STATES` | Maximum states for detokenizer | Default value based on system |
|
||||
| `SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK` | Disable checks for memory imbalance across Tensor Parallel ranks | Not set (defaults to enabled check) |
|
||||
|
||||
## Model-Specific Options
|
||||
|
||||
| Environment Variable | Description | Default Value |
|
||||
| --- | --- | --- |
|
||||
| `SGLANG_USE_AITER` | Use the AITER optimized implementation | `false` |
|
||||
| `SGLANG_INT4_WEIGHT` | Enable INT4 weight quantization | `false` |
|
||||
| `SGLANG_MOE_PADDING` | Enable MoE padding (sets padding size to 128 if value is `1`, often set to `1` in Docker builds) | `0` |
|
||||
| `SGLANG_FORCE_FP8_MARLIN` | Force using FP8 MARLIN kernels even if other FP8 kernels are available | `false` |
|
||||
| `SGLANG_ENABLE_FLASHINFER_GEMM` | Use flashinfer kernels when running blockwise fp8 GEMM on Blackwell GPUs | `false` |
|
||||
| `SGLANG_SUPPORT_CUTLASS_BLOCK_FP8` | Use Cutlass kernels when running blockwise fp8 GEMM on Hopper or Blackwell GPUs | `false` |
|
||||
| `SGLANG_CUTLASS_MOE` | Use Cutlass FP8 MoE kernel on Blackwell GPUs | `false` |
|
||||
|
||||
|
||||
## Distributed Computing
|
||||
|
||||
| Environment Variable | Description | Default Value |
|
||||
| --- | --- | --- |
|
||||
| `SGLANG_BLOCK_NONZERO_RANK_CHILDREN` | Control blocking of non-zero rank children processes | `1` |
|
||||
| `SGL_IS_FIRST_RANK_ON_NODE` | Indicates if the current process is the first rank on its node | `"true"` |
|
||||
| `SGLANG_PP_LAYER_PARTITION` | Pipeline parallel layer partition specification | Not set |
|
||||
|
||||
## Testing & Debugging (Internal/CI)
|
||||
|
||||
*These variables are primarily used for internal testing, continuous integration, or debugging.*
|
||||
|
||||
| Environment Variable | Description | Default Value |
|
||||
| --- | --- | --- |
|
||||
| `SGLANG_IS_IN_CI` | Indicates if running in CI environment | `false` |
|
||||
| `SGLANG_AMD_CI` | Indicates running in AMD CI environment | `0` |
|
||||
| `SGLANG_TEST_RETRACT` | Enable retract decode testing | `false` |
|
||||
| `SGLANG_RECORD_STEP_TIME` | Record step time for profiling | `false` |
|
||||
| `SGLANG_TEST_REQUEST_TIME_STATS` | Test request time statistics | `false` |
|
||||
| `SGLANG_CI_SMALL_KV_SIZE` | Use small KV cache size in CI | Not set |
|
||||
|
||||
## Profiling & Benchmarking
|
||||
|
||||
| Environment Variable | Description | Default Value |
|
||||
| --- | --- | --- |
|
||||
| `SGLANG_TORCH_PROFILER_DIR` | Directory for PyTorch profiler output | `/tmp` |
|
||||
| `SGLANG_PROFILE_WITH_STACK` | Set `with_stack` option (bool) for PyTorch profiler (capture stack trace) | `true` |
|
||||
|
||||
## Storage & Caching
|
||||
|
||||
| Environment Variable | Description | Default Value |
|
||||
| --- | --- | --- |
|
||||
| `SGLANG_DISABLE_OUTLINES_DISK_CACHE` | Disable Outlines disk cache | `true` |
|
||||
35
docs/references/faq.md
Normal file
35
docs/references/faq.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# Troubleshooting and Frequently Asked Questions
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
This page lists common errors and tips for resolving them.
|
||||
|
||||
### CUDA Out of Memory
|
||||
If you encounter out-of-memory (OOM) errors, you can adjust the following parameters:
|
||||
|
||||
- If OOM occurs during prefill, try reducing `--chunked-prefill-size` to `4096` or `2048`. This saves memory but slows down the prefill speed for long prompts.
|
||||
- If OOM occurs during decoding, try lowering `--max-running-requests`.
|
||||
- You can also reduce `--mem-fraction-static` to a smaller value, such as 0.8 or 0.7. This decreases the memory usage of the KV cache memory pool and helps prevent OOM errors during both prefill and decoding. However, it limits maximum concurrency and reduces peak throughput.
|
||||
- Another common case for OOM is requesting input logprobs for a long prompt as it requires significant memory. To address this, set `logprob_start_len` in your sampling parameters to include only the necessary parts. If you do need input logprobs for a long prompt, try reducing `--mem-fraction-static`.
|
||||
|
||||
### CUDA Error: Illegal Memory Access Encountered
|
||||
This error may result from kernel errors or out-of-memory issues:
|
||||
- If it is a kernel error, resolving it may be challenging. Please file an issue on GitHub.
|
||||
- If it is an out-of-memory issue, it may sometimes be reported as this error instead of "Out of Memory." Refer to the section above for guidance on avoiding OOM issues.
|
||||
|
||||
|
||||
## Frequently Asked Questions
|
||||
|
||||
### The results are not deterministic, even with a temperature of 0
|
||||
|
||||
You may notice that when you send the same request twice, the results from the engine will be slightly different, even when the temperature is set to 0.
|
||||
|
||||
From our initial investigation, this indeterminism arises from two factors: dynamic batching and prefix caching. Roughly speaking, dynamic batching accounts for about 95% of the indeterminism, while prefix caching accounts for the remaining portion. The server runs dynamic batching under the hood. Different batch sizes can cause PyTorch/CuBLAS to dispatch to different CUDA kernels, which can lead to slight numerical differences. This difference accumulates across many layers, resulting in nondeterministic output when the batch size changes. Similarly, when prefix caching is enabled, it can also dispatch to different kernels. Even when the computations are mathematically equivalent, small numerical differences from different kernel implementations lead to the final nondeterministic outputs.
|
||||
|
||||
To achieve more deterministic outputs in the current code, you can add `--disable-radix-cache` and send only one request at a time. The results will be mostly deterministic under this setting.
|
||||
|
||||
We are still investigating the root causes and potential solutions. In the short term, we may introduce a "deterministic mode" that uses more padding to address the variance caused by dynamic batching. This mode will be more deterministic but slower.
|
||||
|
||||
We have two issues to track our progress:
|
||||
- The deterministic mode is tracked at [https://github.com/sgl-project/sglang/issues/1729](https://github.com/sgl-project/sglang/issues/1729).
|
||||
- The per-request random seed is tracked at [https://github.com/sgl-project/sglang/issues/1335](https://github.com/sgl-project/sglang/issues/1335).
|
||||
77
docs/references/frontend/choices_methods.md
Normal file
77
docs/references/frontend/choices_methods.md
Normal file
@@ -0,0 +1,77 @@
|
||||
# Choices Methods in SGLang
|
||||
This doc describes the choices methods supported by SGLang.
|
||||
|
||||
The optional `choices_method` arg determines how options supplied to SGLang's `choices` primitive are selected. Only the `RuntimeEndpoint` backend supports the `choices_method` arg. Other backends, such as `OpenAI`, have bespoke selection implementations due to API limitations.
|
||||
|
||||
## Methods
|
||||
|
||||
### Token Length Normalized
|
||||
|
||||
Token length normalized is the default SGLang choices method. It selects the option with the highest average logprob across all of its tokens.
|
||||
|
||||
Usage example (alternatively, simply omit the `choices_method` arg):
|
||||
```python
|
||||
@sgl.function
|
||||
def example(s):
|
||||
s += sgl.user("What is the capital of France?")
|
||||
s += sgl.assistant(
|
||||
sgl.gen(
|
||||
"answer",
|
||||
choices=["London", "Paris", "Berlin"],
|
||||
choices_method=sgl.token_length_normalized,
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
This can perform poorly if an option contains many tokens, where its later tokens are predicted with high confidence based on its earlier tokens. For instance, even strong models will fail the above example if the specified options are `["Paris", "Antidisestablishmentarianism"]`.
|
||||
|
||||
### Greedy Token Selection
|
||||
|
||||
Greedy token selection simply selects the option with the highest logprob for its initial token. For overlapping options where one option is a subset of a longer option, the logprobs of the shorter option are extended using its average logprob for comparison against the longer option.
|
||||
|
||||
Usage example:
|
||||
```python
|
||||
@sgl.function
|
||||
def example(s):
|
||||
s += sgl.user("What is the capital of France?")
|
||||
s += sgl.assistant(
|
||||
sgl.gen(
|
||||
"answer",
|
||||
choices=["London", "Paris", "Berlin"],
|
||||
choices_method=sgl.greedy_token_selection,
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
This can perform poorly if an option misleads the model down a bad path based on an attractive initial token. For instance, greedy selection will result in an incorrect response for this example:
|
||||
```python
|
||||
@sgl.function
|
||||
def us_president_example(s):
|
||||
s += sgl.user("Name a US president.")
|
||||
s += sgl.assistant(
|
||||
sgl.gen(
|
||||
"answer",
|
||||
choices=["Donald Duck", "Millard Fillmore"],
|
||||
choices_method=sgl.greedy_token_selection,
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Unconditional Likelihood Normalized
|
||||
|
||||
Unconditional likelihood normalized selects the option with the highest average token logprob once normalized by the unconditional token logprobs, as described in [this EleutherAI blogpost](https://blog.eleuther.ai/multiple-choice-normalization/). This method incurs an additional LLM call to obtain the unconditional likelihoods.
|
||||
|
||||
Usage example:
|
||||
```python
|
||||
@sgl.function
|
||||
def example(s):
|
||||
s += sgl.user("What is the capital of France?")
|
||||
s += sgl.assistant(
|
||||
sgl.gen(
|
||||
"answer",
|
||||
choices=["London", "Paris", "Berlin"],
|
||||
choices_method=sgl.unconditional_likelihood_normalized,
|
||||
)
|
||||
)
|
||||
```
|
||||
9
docs/references/frontend/frontend_index.rst
Normal file
9
docs/references/frontend/frontend_index.rst
Normal file
@@ -0,0 +1,9 @@
|
||||
Frontend Language
|
||||
=================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Frontend Language
|
||||
|
||||
frontend_tutorial.ipynb
|
||||
choices_methods.md
|
||||
456
docs/references/frontend/frontend_tutorial.ipynb
Normal file
456
docs/references/frontend/frontend_tutorial.ipynb
Normal file
@@ -0,0 +1,456 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# SGLang Frontend Language"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"SGLang frontend language can be used to define simple and easy prompts in a convenient, structured way."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Launch A Server\n",
|
||||
"\n",
|
||||
"Launch the server in your terminal and wait for it to initialize."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sglang import assistant_begin, assistant_end\n",
|
||||
"from sglang import assistant, function, gen, system, user\n",
|
||||
"from sglang import image\n",
|
||||
"from sglang import RuntimeEndpoint\n",
|
||||
"from sglang.lang.api import set_default_backend\n",
|
||||
"from sglang.srt.utils import load_image\n",
|
||||
"from sglang.test.doc_patch import launch_server_cmd\n",
|
||||
"from sglang.utils import print_highlight, terminate_process, wait_for_server\n",
|
||||
"\n",
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --log-level warning\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")\n",
|
||||
"print(f\"Server started on http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Set the default backend. Note: Besides the local server, you may use also `OpenAI` or other API endpoints."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"set_default_backend(RuntimeEndpoint(f\"http://localhost:{port}\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Basic Usage\n",
|
||||
"\n",
|
||||
"The most simple way of using SGLang frontend language is a simple question answer dialog between a user and an assistant."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@function\n",
|
||||
"def basic_qa(s, question):\n",
|
||||
"    s += system(f\"You are a helpful assistant that can answer questions.\")\n",
|
||||
" s += user(question)\n",
|
||||
" s += assistant(gen(\"answer\", max_tokens=512))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"state = basic_qa(\"List 3 countries and their capitals.\")\n",
|
||||
"print_highlight(state[\"answer\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Multi-turn Dialog\n",
|
||||
"\n",
|
||||
"SGLang frontend language can also be used to define multi-turn dialogs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@function\n",
|
||||
"def multi_turn_qa(s):\n",
|
||||
"    s += system(f\"You are a helpful assistant that can answer questions.\")\n",
|
||||
" s += user(\"Please give me a list of 3 countries and their capitals.\")\n",
|
||||
" s += assistant(gen(\"first_answer\", max_tokens=512))\n",
|
||||
" s += user(\"Please give me another list of 3 countries and their capitals.\")\n",
|
||||
" s += assistant(gen(\"second_answer\", max_tokens=512))\n",
|
||||
" return s\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"state = multi_turn_qa()\n",
|
||||
"print_highlight(state[\"first_answer\"])\n",
|
||||
"print_highlight(state[\"second_answer\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Control flow\n",
|
||||
"\n",
|
||||
"You may use any Python code within the function to define more complex control flows."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@function\n",
|
||||
"def tool_use(s, question):\n",
|
||||
" s += assistant(\n",
|
||||
" \"To answer this question: \"\n",
|
||||
" + question\n",
|
||||
" + \". I need to use a \"\n",
|
||||
" + gen(\"tool\", choices=[\"calculator\", \"search engine\"])\n",
|
||||
" + \". \"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if s[\"tool\"] == \"calculator\":\n",
|
||||
" s += assistant(\"The math expression is: \" + gen(\"expression\"))\n",
|
||||
" elif s[\"tool\"] == \"search engine\":\n",
|
||||
" s += assistant(\"The key word to search is: \" + gen(\"word\"))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"state = tool_use(\"What is 2 * 2?\")\n",
|
||||
"print_highlight(state[\"tool\"])\n",
|
||||
"print_highlight(state[\"expression\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Parallelism\n",
|
||||
"\n",
|
||||
"Use `fork` to launch parallel prompts. Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@function\n",
|
||||
"def tip_suggestion(s):\n",
|
||||
" s += assistant(\n",
|
||||
" \"Here are two tips for staying healthy: \"\n",
|
||||
" \"1. Balanced Diet. 2. Regular Exercise.\\n\\n\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" forks = s.fork(2)\n",
|
||||
" for i, f in enumerate(forks):\n",
|
||||
" f += assistant(\n",
|
||||
" f\"Now, expand tip {i+1} into a paragraph:\\n\"\n",
|
||||
" + gen(\"detailed_tip\", max_tokens=256, stop=\"\\n\\n\")\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" s += assistant(\"Tip 1:\" + forks[0][\"detailed_tip\"] + \"\\n\")\n",
|
||||
" s += assistant(\"Tip 2:\" + forks[1][\"detailed_tip\"] + \"\\n\")\n",
|
||||
" s += assistant(\n",
|
||||
" \"To summarize the above two tips, I can say:\\n\" + gen(\"summary\", max_tokens=512)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"state = tip_suggestion()\n",
|
||||
"print_highlight(state[\"summary\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Constrained Decoding\n",
|
||||
"\n",
|
||||
"Use `regex` to specify a regular expression as a decoding constraint. This is only supported for local models."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@function\n",
|
||||
"def regular_expression_gen(s):\n",
|
||||
" s += user(\"What is the IP address of the Google DNS servers?\")\n",
|
||||
" s += assistant(\n",
|
||||
" gen(\n",
|
||||
" \"answer\",\n",
|
||||
" temperature=0,\n",
|
||||
" regex=r\"((25[0-5]|2[0-4]\\d|[01]?\\d\\d?).){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\",\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"state = regular_expression_gen()\n",
|
||||
"print_highlight(state[\"answer\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Use `regex` to define a `JSON` decoding schema."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"character_regex = (\n",
|
||||
" r\"\"\"\\{\\n\"\"\"\n",
|
||||
" + r\"\"\" \"name\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
|
||||
" + r\"\"\" \"house\": \"(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)\",\\n\"\"\"\n",
|
||||
" + r\"\"\" \"blood status\": \"(Pure-blood|Half-blood|Muggle-born)\",\\n\"\"\"\n",
|
||||
" + r\"\"\" \"occupation\": \"(student|teacher|auror|ministry of magic|death eater|order of the phoenix)\",\\n\"\"\"\n",
|
||||
" + r\"\"\" \"wand\": \\{\\n\"\"\"\n",
|
||||
" + r\"\"\" \"wood\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
|
||||
" + r\"\"\" \"core\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
|
||||
" + r\"\"\" \"length\": [0-9]{1,2}\\.[0-9]{0,2}\\n\"\"\"\n",
|
||||
" + r\"\"\" \\},\\n\"\"\"\n",
|
||||
" + r\"\"\" \"alive\": \"(Alive|Deceased)\",\\n\"\"\"\n",
|
||||
" + r\"\"\" \"patronus\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
|
||||
" + r\"\"\" \"bogart\": \"[\\w\\d\\s]{1,16}\"\\n\"\"\"\n",
|
||||
" + r\"\"\"\\}\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@function\n",
|
||||
"def character_gen(s, name):\n",
|
||||
" s += user(\n",
|
||||
" f\"{name} is a character in Harry Potter. Please fill in the following information about this character.\"\n",
|
||||
" )\n",
|
||||
" s += assistant(gen(\"json_output\", max_tokens=256, regex=character_regex))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"state = character_gen(\"Harry Potter\")\n",
|
||||
"print_highlight(state[\"json_output\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Batching \n",
|
||||
"\n",
|
||||
"Use `run_batch` to run a batch of prompts."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@function\n",
|
||||
"def text_qa(s, question):\n",
|
||||
" s += user(question)\n",
|
||||
" s += assistant(gen(\"answer\", stop=\"\\n\"))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"states = text_qa.run_batch(\n",
|
||||
" [\n",
|
||||
" {\"question\": \"What is the capital of the United Kingdom?\"},\n",
|
||||
" {\"question\": \"What is the capital of France?\"},\n",
|
||||
" {\"question\": \"What is the capital of Japan?\"},\n",
|
||||
" ],\n",
|
||||
" progress_bar=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for i, state in enumerate(states):\n",
|
||||
" print_highlight(f\"Answer {i+1}: {states[i]['answer']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Streaming \n",
|
||||
"\n",
|
||||
"Use `stream` to stream the output to the user."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@function\n",
|
||||
"def text_qa(s, question):\n",
|
||||
" s += user(question)\n",
|
||||
" s += assistant(gen(\"answer\", stop=\"\\n\"))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"state = text_qa.run(\n",
|
||||
" question=\"What is the capital of France?\", temperature=0.1, stream=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for out in state.text_iter():\n",
|
||||
" print(out, end=\"\", flush=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Complex Prompts\n",
|
||||
"\n",
|
||||
"You may use `{system|user|assistant}_{begin|end}` to define complex prompts."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@function\n",
|
||||
"def chat_example(s):\n",
|
||||
" s += system(\"You are a helpful assistant.\")\n",
|
||||
" # Same as: s += s.system(\"You are a helpful assistant.\")\n",
|
||||
"\n",
|
||||
" with s.user():\n",
|
||||
" s += \"Question: What is the capital of France?\"\n",
|
||||
"\n",
|
||||
" s += assistant_begin()\n",
|
||||
" s += \"Answer: \" + gen(\"answer\", max_tokens=100, stop=\"\\n\")\n",
|
||||
" s += assistant_end()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"state = chat_example()\n",
|
||||
"print_highlight(state[\"answer\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Multi-modal Generation\n",
|
||||
"\n",
|
||||
"You may use SGLang frontend language to define multi-modal prompts.\n",
|
||||
"See [here](https://docs.sglang.ai/supported_models/generative_models.html) for supported models."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"server_process, port = launch_server_cmd(\n",
|
||||
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0 --log-level warning\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")\n",
|
||||
"print(f\"Server started on http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"set_default_backend(RuntimeEndpoint(f\"http://localhost:{port}\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Ask a question about an image."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@function\n",
|
||||
"def image_qa(s, image_file, question):\n",
|
||||
" s += user(image(image_file) + question)\n",
|
||||
" s += assistant(gen(\"answer\", max_tokens=256))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"image_url = \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
|
||||
"image_bytes, _ = load_image(image_url)\n",
|
||||
"state = image_qa(image_bytes, \"What is in the image?\")\n",
|
||||
"print_highlight(state[\"answer\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
7
docs/references/learn_more.md
Normal file
7
docs/references/learn_more.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# Learn more
|
||||
|
||||
You can find more blogs, slides, and videos about SGLang at [https://github.com/sgl-project/sgl-learning-materials](https://github.com/sgl-project/sgl-learning-materials).
|
||||
|
||||
The latest SGLang features and updates are shared through the [LMSYS blog](https://lmsys.org/blog/).
|
||||
|
||||
The 2025 H2 roadmap can be found at this [issue](https://github.com/sgl-project/sglang/issues/7736).
|
||||
337
docs/references/multi_node_deployment/deploy_on_k8s.md
Normal file
337
docs/references/multi_node_deployment/deploy_on_k8s.md
Normal file
@@ -0,0 +1,337 @@
|
||||
# Deploy On Kubernetes
|
||||
|
||||
This document is for deploying a RoCE network-based SGLang two-node inference service on a Kubernetes (K8S) cluster.
|
||||
|
||||
[LeaderWorkerSet (LWS)](https://github.com/kubernetes-sigs/lws) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. A major use case is for multi-host/multi-node distributed inference.
|
||||
|
||||
SGLang can also be deployed with LWS on Kubernetes for distributed model serving.
|
||||
|
||||
Please see this guide for more details on deploying SGLang on Kubernetes using LWS.
|
||||
|
||||
Here we take the deployment of DeepSeek-R1 as an example.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. At least two Kubernetes nodes, each with two H20 systems and eight GPUs, are required.
|
||||
|
||||
2. Make sure your K8S cluster has LWS correctly installed. If it hasn't been set up yet, please follow the [installation instructions](https://github.com/kubernetes-sigs/lws/blob/main/site/content/en/docs/installation/_index.md). **Note:** For LWS versions ≤0.5.x, you must use the Downward API to obtain `LWS_WORKER_INDEX`, as native support for this feature was introduced in v0.6.0.
|
||||
|
||||
## Basic example
|
||||
|
||||
For the basic example documentation, refer to [Deploy Distributed Inference Service with SGLang and LWS on GPUs](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/sglang).
|
||||
|
||||
However, that document only covers the basic NCCL socket mode.
|
||||
|
||||
In this section, we’ll make some simple modifications to adapt the setup to the RDMA scenario.
|
||||
|
||||
## RDMA RoCE case
|
||||
|
||||
* Check your env:
|
||||
|
||||
```bash
|
||||
[root@node1 ~]# ibstatus
|
||||
Infiniband device 'mlx5_bond_0' port 1 status:
|
||||
default gid: fe80:0000:0000:0000:0225:9dff:fe64:c79a
|
||||
base lid: 0x0
|
||||
sm lid: 0x0
|
||||
state: 4: ACTIVE
|
||||
phys state: 5: LinkUp
|
||||
rate: 200 Gb/sec (2X NDR)
|
||||
link_layer: Ethernet
|
||||
|
||||
Infiniband device 'mlx5_bond_1' port 1 status:
|
||||
default gid: fe80:0000:0000:0000:0225:9dff:fe6e:c3ec
|
||||
base lid: 0x0
|
||||
sm lid: 0x0
|
||||
state: 4: ACTIVE
|
||||
phys state: 5: LinkUp
|
||||
rate: 200 Gb/sec (2X NDR)
|
||||
link_layer: Ethernet
|
||||
|
||||
Infiniband device 'mlx5_bond_2' port 1 status:
|
||||
default gid: fe80:0000:0000:0000:0225:9dff:fe73:0dd7
|
||||
base lid: 0x0
|
||||
sm lid: 0x0
|
||||
state: 4: ACTIVE
|
||||
phys state: 5: LinkUp
|
||||
rate: 200 Gb/sec (2X NDR)
|
||||
link_layer: Ethernet
|
||||
|
||||
Infiniband device 'mlx5_bond_3' port 1 status:
|
||||
default gid: fe80:0000:0000:0000:0225:9dff:fe36:f7ff
|
||||
base lid: 0x0
|
||||
sm lid: 0x0
|
||||
state: 4: ACTIVE
|
||||
phys state: 5: LinkUp
|
||||
rate: 200 Gb/sec (2X NDR)
|
||||
link_layer: Ethernet
|
||||
```
|
||||
|
||||
* Prepare the `lws.yaml` file for deploying on k8s.
|
||||
|
||||
```yaml
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: sglang
|
||||
spec:
|
||||
replicas: 1
|
||||
leaderWorkerTemplate:
|
||||
size: 2
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostNetwork: true
|
||||
hostIPC: true
|
||||
containers:
|
||||
- name: sglang-leader
|
||||
image: sglang:latest
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
- name: NCCL_IB_GID_INDEX
|
||||
value: "3"
|
||||
command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --mem-fraction-static
|
||||
- "0.93"
|
||||
- --torch-compile-max-bs
|
||||
- "8"
|
||||
- --max-running-requests
|
||||
- "20"
|
||||
- --tp
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20000
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --port
|
||||
- "40000"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
ports:
|
||||
- containerPort: 40000
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: 40000
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- name: model
|
||||
mountPath: /work/models
|
||||
- name: ib
|
||||
mountPath: /dev/infiniband
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: model
|
||||
hostPath:
|
||||
          path: '< your models dir >' # modify it according to your models dir
|
||||
- name: ib
|
||||
hostPath:
|
||||
path: /dev/infiniband
|
||||
workerTemplate:
|
||||
spec:
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostNetwork: true
|
||||
hostIPC: true
|
||||
containers:
|
||||
- name: sglang-worker
|
||||
image: sglang:latest
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
- name: NCCL_IB_GID_INDEX
|
||||
value: "3"
|
||||
command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --mem-fraction-static
|
||||
- "0.93"
|
||||
- --torch-compile-max-bs
|
||||
- "8"
|
||||
- --max-running-requests
|
||||
- "20"
|
||||
- --tp
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20000
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- name: model
|
||||
mountPath: /work/models
|
||||
- name: ib
|
||||
mountPath: /dev/infiniband
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: ib
|
||||
hostPath:
|
||||
path: /dev/infiniband
|
||||
- name: model
|
||||
hostPath:
|
||||
path: /data1/models/deepseek_v3_moe
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: sglang-leader
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: sglang
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 40000
|
||||
targetPort: 40000
|
||||
|
||||
```
|
||||
|
||||
* Then run `kubectl apply -f lws.yaml`, and you will get this output.
|
||||
|
||||
```text
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
sglang-0 0/1 Running 0 9s
|
||||
sglang-0-1 1/1 Running 0 9s
|
||||
```
|
||||
|
||||
Wait for the sglang leader (`sglang-0`) status to change to 1/1, which indicates it is `Ready`.
|
||||
|
||||
You can use the command `kubectl logs -f sglang-0` to view the logs of the leader node.
|
||||
|
||||
Once successful, you should see output like this:
|
||||
|
||||
```text
|
||||
[2025-02-17 05:27:24 TP1] Capture cuda graph end. Time elapsed: 84.89 s
|
||||
[2025-02-17 05:27:24 TP6] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP0] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP7] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP3] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP2] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP4] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP1] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24 TP5] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
|
||||
[2025-02-17 05:27:24] INFO: Started server process [1]
|
||||
[2025-02-17 05:27:24] INFO: Waiting for application startup.
|
||||
[2025-02-17 05:27:24] INFO: Application startup complete.
|
||||
[2025-02-17 05:27:24] INFO: Uvicorn running on http://0.0.0.0:40000 (Press CTRL+C to quit)
|
||||
[2025-02-17 05:27:25] INFO: 127.0.0.1:48908 - "GET /get_model_info HTTP/1.1" 200 OK
|
||||
[2025-02-17 05:27:25 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0
|
||||
[2025-02-17 05:27:32] INFO: 127.0.0.1:48924 - "POST /generate HTTP/1.1" 200 OK
|
||||
[2025-02-17 05:27:32] The server is fired up and ready to roll!
|
||||
```
|
||||
|
||||
If it doesn’t start up successfully, please follow these steps to check for any remaining issues. Thanks!
|
||||
|
||||
### Debug
|
||||
|
||||
* Set `NCCL_DEBUG=TRACE` to check if it is a NCCL communication problem.
|
||||
|
||||
This should resolve most NCCL-related issues.
|
||||
|
||||
***Notice: If you find that NCCL_DEBUG=TRACE is not effective in the container environment, but the process is stuck or you encounter hard-to-diagnose issues, try switching to a different container image. Some images may not handle standard error output properly.***
|
||||
|
||||
#### RoCE scenario
|
||||
|
||||
* Please make sure that RDMA devices are available in the cluster environment.
|
||||
* Please make sure that the nodes in the cluster have Mellanox NICs with RoCE. In this example, we use Mellanox ConnectX 5 model NICs, and the proper OFED driver has been installed. If not, please refer to the document [Install OFED Driver](https://docs.nvidia.com/networking/display/mlnxofedv461000/installing+mellanox+ofed) to install the driver.
|
||||
* Check your env:
|
||||
|
||||
```shell
|
||||
$ lspci -nn | grep Eth | grep Mellanox
|
||||
0000:7f:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0000:7f:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0000:c7:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0000:c7:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0001:08:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0001:08:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0001:a2:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
0001:a2:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
|
||||
```
|
||||
|
||||
* Check the OFED driver:
|
||||
|
||||
```shell
|
||||
ofed_info -s
|
||||
OFED-internal-23.07-0.5.0:
|
||||
```
|
||||
|
||||
* Show RDMA link status and check IB devices:
|
||||
|
||||
```shell
|
||||
$ rdma link show
|
||||
8/1: mlx5_bond_0/1: state ACTIVE physical_state LINK_UP netdev reth0
|
||||
9/1: mlx5_bond_1/1: state ACTIVE physical_state LINK_UP netdev reth2
|
||||
10/1: mlx5_bond_2/1: state ACTIVE physical_state LINK_UP netdev reth4
|
||||
11/1: mlx5_bond_3/1: state ACTIVE physical_state LINK_UP netdev reth6
|
||||
|
||||
$ ibdev2netdev
|
||||
8/1: mlx5_bond_0/1: state ACTIVE physical_state LINK_UP netdev reth0
|
||||
9/1: mlx5_bond_1/1: state ACTIVE physical_state LINK_UP netdev reth2
|
||||
10/1: mlx5_bond_2/1: state ACTIVE physical_state LINK_UP netdev reth4
|
||||
11/1: mlx5_bond_3/1: state ACTIVE physical_state LINK_UP netdev reth6
|
||||
```
|
||||
|
||||
* Test RoCE network speed on the host:
|
||||
|
||||
```shell
|
||||
yum install qperf
|
||||
# for server:
|
||||
execute qperf
|
||||
# for client
|
||||
qperf -t 60 -cm1 <server_ip> rc_rdma_write_bw
|
||||
```
|
||||
|
||||
* Check RDMA accessible in your container:
|
||||
|
||||
```shell
|
||||
# ibv_devices
|
||||
# ibv_devinfo
|
||||
```
|
||||
|
||||
## Keys to success
|
||||
|
||||
* In the YAML configuration above, pay attention to the NCCL environment variable. For older versions of NCCL, you should check the NCCL_IB_GID_INDEX environment setting.
|
||||
* NCCL_SOCKET_IFNAME is also crucial, but in a containerized environment, this typically isn’t an issue.
|
||||
* In some cases, it’s necessary to configure GLOO_SOCKET_IFNAME correctly.
|
||||
* NCCL_DEBUG is essential for troubleshooting, but I've found that sometimes it doesn't show error logs within containers. This could be related to the Docker image you're using. You may want to try switching images if needed.
|
||||
* Avoid using Docker images based on Ubuntu 18.04, as they tend to have compatibility issues.
|
||||
|
||||
## Remaining issues
|
||||
|
||||
* In Kubernetes, Docker, or Containerd environments, we use hostNetwork to prevent performance degradation.
|
||||
* We utilize privileged mode, which isn’t secure. Additionally, in containerized environments, full GPU isolation cannot be achieved.
|
||||
|
||||
## TODO
|
||||
|
||||
* Integrated with [k8s-rdma-shared-dev-plugin](https://github.com/Mellanox/k8s-rdma-shared-dev-plugin).
|
||||
@@ -0,0 +1,12 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-decode-main
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000
|
||||
targetPort: 30000
|
||||
290
docs/references/multi_node_deployment/lws_pd/lws-examples/d.yaml
Normal file
290
docs/references/multi_node_deployment/lws_pd/lws-examples/d.yaml
Normal file
@@ -0,0 +1,290 @@
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: deepseekr10528-decode-main
|
||||
spec:
|
||||
leaderWorkerTemplate:
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --port
|
||||
- "30000"
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --chunked-prefill-size
|
||||
- "262144"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --enable-dp-attention
|
||||
- --enable-dp-lm-head
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- decode
|
||||
- --mem-fraction-static
|
||||
- "0.849"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --disaggregation-ib-device
|
||||
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
|
||||
- --cuda-graph-max-bs
|
||||
- "64"
|
||||
- --max-running-requests
|
||||
- "2048"
|
||||
- --tp-size
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
env:
|
||||
- name: CUDA_LAUNCH_BLOCKING
|
||||
value: "0"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: NVSHMEM_HCA_PE_MAPPING
|
||||
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: "none"
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "16"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:latest
|
||||
name: sglang-leader
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 30000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
# should modify according to your deployment env
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
# should modify according to your deployment env
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache1
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
size: 2
|
||||
workerTemplate:
|
||||
metadata: {}
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --chunked-prefill-size
|
||||
- "262144"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --enable-dp-attention
|
||||
- --enable-dp-lm-head
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- decode
|
||||
- --mem-fraction-static
|
||||
- "0.849"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --disaggregation-ib-device
|
||||
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
|
||||
- --cuda-graph-max-bs
|
||||
- "64"
|
||||
- --max-running-requests
|
||||
- "2048"
|
||||
- --tp-size
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
env:
|
||||
- name: NVSHMEM_IB_TRAFFIC_CLASS
|
||||
value: "16"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: NVSHMEM_HCA_PE_MAPPING
|
||||
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: "none"
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "16"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:latest
|
||||
name: sglang-worker
|
||||
ports:
|
||||
- containerPort: 30001
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
# should modify according to your deployment env
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
# should modify according to your deployment env
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache1
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
# modify according to your deployment env
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
# modify according to your deployment env
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
networkConfig:
|
||||
subdomainPolicy: Shared
|
||||
replicas: 1
|
||||
rolloutStrategy:
|
||||
rollingUpdateConfiguration:
|
||||
maxSurge: 0
|
||||
maxUnavailable: 1
|
||||
type: RollingUpdate
|
||||
startupPolicy: LeaderCreated
|
||||
@@ -0,0 +1,56 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: deepseekr10528-lb-main
|
||||
labels:
|
||||
app: deepseekr10528-lb
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: deepseekr10528-lb
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: deepseekr10528-lb
|
||||
spec:
|
||||
nodeSelector:
|
||||
bo: "yes"
|
||||
tolerations:
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
containers:
|
||||
- name: sgl-minilb
|
||||
image: lmsysorg/sglang:latest
|
||||
command:
|
||||
- python
|
||||
- -m
|
||||
- sglang_router.launch_router
|
||||
- --pd-disaggregation
|
||||
- --prefill
|
||||
- http://deepseekr10528-prefill-main:30000
|
||||
- --decode
|
||||
- http://deepseekr10528-decode-main:30000
|
||||
- --host
|
||||
- 0.0.0.0
|
||||
- --port
|
||||
- "8000"
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-lb-service
|
||||
spec:
|
||||
type: NodePort # NodePort is easy to test, you can also specify `ClusterIP`
|
||||
selector:
|
||||
app: deepseekr10528-lb
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000 # Service Port(In-Cluster)
|
||||
targetPort: 8000 # Exposed Container
|
||||
nodePort: 30800
|
||||
@@ -0,0 +1,12 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-prefill-main
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000
|
||||
targetPort: 30000
|
||||
304
docs/references/multi_node_deployment/lws_pd/lws-examples/p.yaml
Normal file
304
docs/references/multi_node_deployment/lws_pd/lws-examples/p.yaml
Normal file
@@ -0,0 +1,304 @@
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: deepseekr10528-prefill-main
|
||||
spec:
|
||||
leaderWorkerTemplate:
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --port
|
||||
- "30000"
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --disaggregation-ib-device
|
||||
# should modify according to your RDMA env
|
||||
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
|
||||
- --chunked-prefill-size
|
||||
- "524288"
|
||||
- --max-prefill-tokens
|
||||
- "32768"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --ep-dispatch-algorithm
|
||||
- dynamic
|
||||
- --eplb-algorithm
|
||||
- deepseek
|
||||
- --enable-dp-lm-head
|
||||
- --enable-dp-attention
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --disable-radix-cache
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- prefill
|
||||
- --mem-fraction-static
|
||||
- "0.7"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --tp
|
||||
- "16"
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
- --max-running-requests
|
||||
- "1024"
|
||||
env:
|
||||
- name: NVSHMEM_HCA_PE_MAPPING
|
||||
# should modify according to your RDMA env
|
||||
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: SGLANG_SET_CPU_AFFINITY
|
||||
value: "true"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: none
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "false"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:latest
|
||||
name: sglang-leader
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 30000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
# should modify according to your deployment env
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
# should modify according to your deployment env
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
size: 2
|
||||
workerTemplate:
|
||||
metadata: {}
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --disaggregation-ib-device
|
||||
# should modify according to your RDMA env
|
||||
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
|
||||
- --chunked-prefill-size
|
||||
- "524288"
|
||||
- --max-prefill-tokens
|
||||
- "32768"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --ep-dispatch-algorithm
|
||||
- dynamic
|
||||
- --eplb-algorithm
|
||||
- deepseek
|
||||
# - --deepep-config
|
||||
# - /home/aiges/tuned/tuned_8sms.json
|
||||
# can be tuned using deepep test scripts
|
||||
- --enable-dp-lm-head
|
||||
- --enable-dp-attention
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --disable-radix-cache
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- prefill
|
||||
- --mem-fraction-static
|
||||
- "0.7"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --tp
|
||||
- "16"
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
- --max-running-requests
|
||||
- "1024"
|
||||
env:
|
||||
- name: SGLANG_SET_CPU_AFFINITY
|
||||
value: "true"
|
||||
- name: NVSHMEM_HCA_PE_MAPPING
|
||||
# should modify according to your RDMA env
|
||||
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: NVSHMEM_IB_TRAFFIC_CLASS
|
||||
value: "16"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: CUDA_LAUNCH_BLOCKING
|
||||
value: "0"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "8"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
|
||||
value: "0"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: none
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:latest
|
||||
name: sglang-worker
|
||||
ports:
|
||||
- containerPort: 30001
|
||||
protocol: TCP
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
# should modify according to your deployment env
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
# should modify according to your deployment env
|
||||
- key: bopd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
# modify according to your deployment env
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
# modify according to your deployment env
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
- hostPath:
|
||||
# modify according to your deployment env
|
||||
path: /data1/sgl_cache
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
783
docs/references/multi_node_deployment/lws_pd/lws_pd_deploy.md
Normal file
783
docs/references/multi_node_deployment/lws_pd/lws_pd_deploy.md
Normal file
@@ -0,0 +1,783 @@
|
||||
# LWS Based PD Deploy
|
||||
|
||||
## 0. Prerequisites
|
||||
|
||||
1. k8s >=1.26
|
||||
2. lws installed on k8s.
|
||||
|
||||
## 1. Image Preparation
|
||||
|
||||
`lmsysorg/sglang:deepep`
|
||||
|
||||
## 2. Deployment Manifest Files
|
||||
|
||||
***Notice: We will package all deployment files into Helm Chart format in the near future. Interested community members can contact us to contribute***
|
||||
|
||||
### Prefill
|
||||
|
||||
Prefill manifest file [prefill.yaml](lws-examples/p.yaml)
|
||||
|
||||
*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment*
|
||||
|
||||
```yaml
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: deepseekr10528-prefill-main
|
||||
spec:
|
||||
leaderWorkerTemplate:
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --port
|
||||
- "30000"
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --disaggregation-ib-device
|
||||
# should modify according to your RDMA env
|
||||
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
|
||||
- --chunked-prefill-size
|
||||
- "524288"
|
||||
- --max-prefill-tokens
|
||||
- "32768"
|
||||
- --page-size
|
||||
- "64"
|
||||
# - --init-expert-location
|
||||
# - /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json
|
||||
- --ep-dispatch-algorithm
|
||||
- dynamic
|
||||
- --eplb-algorithm
|
||||
- deepseek
|
||||
# - --deepep-config
|
||||
# - /home/aiges/tuned/tuned_8sms.json
|
||||
- --enable-dp-lm-head
|
||||
- --enable-dp-attention
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --disable-radix-cache
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- prefill
|
||||
- --mem-fraction-static
|
||||
- "0.7"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --tp
|
||||
- "16"
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
- --max-running-requests
|
||||
- "1024"
|
||||
env:
|
||||
# - name: NVSHMEM_HCA_PE_MAPPING
|
||||
# value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
# - name: NVSHMEM_HCA_LIST
|
||||
# value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: SGLANG_SET_CPU_AFFINITY
|
||||
value: "true"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: none
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "false"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:deepep
|
||||
name: sglang-leader
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 30000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
# modify according to your deployment env
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
# modify according to your deployment env
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
- hostPath:
|
||||
# modify according to your deployment env
|
||||
path: /data1/sgl_cache
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
size: 2
|
||||
workerTemplate:
|
||||
metadata: {}
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --disaggregation-ib-device
|
||||
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
|
||||
- --chunked-prefill-size
|
||||
- "524288"
|
||||
- --max-prefill-tokens
|
||||
- "32768"
|
||||
- --page-size
|
||||
- "64"
|
||||
#- --init-expert-location
|
||||
#- /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json
|
||||
- --ep-dispatch-algorithm
|
||||
- dynamic
|
||||
- --eplb-algorithm
|
||||
- deepseek
|
||||
# - --deepep-config
|
||||
# - /home/aiges/tuned/tuned_8sms.json
|
||||
- --enable-dp-lm-head
|
||||
- --enable-dp-attention
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --disable-radix-cache
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- prefill
|
||||
- --mem-fraction-static
|
||||
- "0.7"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --tp
|
||||
- "16"
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
- --max-running-requests
|
||||
- "1024"
|
||||
env:
|
||||
- name: SGLANG_SET_CPU_AFFINITY
|
||||
value: "true"
|
||||
- name: SGLANG_HACK_DEEPEP_NUM_SMS
|
||||
value: "8"
|
||||
- name: SGLANG_HACK_DEEPEP_NEW_MODE
|
||||
value: "0"
|
||||
# - name: NVSHMEM_HCA_PE_MAPPING
|
||||
# value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
|
||||
# - name: NVSHMEM_HCA_LIST
|
||||
# value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: NVSHMEM_IB_TRAFFIC_CLASS
|
||||
value: "16"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: CUDA_LAUNCH_BLOCKING
|
||||
value: "0"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "8"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
|
||||
value: "0"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: none
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:deepep
|
||||
name: sglang-worker
|
||||
ports:
|
||||
- containerPort: 30001
|
||||
protocol: TCP
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
|
||||
```
|
||||
|
||||
### Decode
|
||||
|
||||
Decode node deployment manifest file [decode.yaml](lws-examples/d.yaml)
|
||||
|
||||
*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment*
|
||||
|
||||
```yaml
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: deepseekr10528-decode-main
|
||||
spec:
|
||||
leaderWorkerTemplate:
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --port
|
||||
- "30000"
|
||||
- --host
|
||||
- "0.0.0.0"
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --chunked-prefill-size
|
||||
- "262144"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --enable-dp-attention
|
||||
- --enable-dp-lm-head
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- decode
|
||||
- --mem-fraction-static
|
||||
- "0.849"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --disaggregation-ib-device
|
||||
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
|
||||
- --cuda-graph-max-bs
|
||||
- "64"
|
||||
- --max-running-requests
|
||||
- "2048"
|
||||
- --tp-size
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
env:
|
||||
- name: CUDA_LAUNCH_BLOCKING
|
||||
value: "0"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: "none"
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "16"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:deepep
|
||||
name: sglang-leader
|
||||
ports:
|
||||
- containerPort: 30000
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 30000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache1
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
size: 2
|
||||
workerTemplate:
|
||||
metadata: {}
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- -m
|
||||
- sglang.launch_server
|
||||
- --model-path
|
||||
- /work/models
|
||||
- --chunked-prefill-size
|
||||
- "262144"
|
||||
- --page-size
|
||||
- "64"
|
||||
- --enable-dp-attention
|
||||
- --enable-dp-lm-head
|
||||
#- --enable-two-batch-overlap
|
||||
- --dp-size
|
||||
- "16"
|
||||
- --moe-a2a-backend
|
||||
- deepep
|
||||
- --disaggregation-mode
|
||||
- decode
|
||||
- --mem-fraction-static
|
||||
- "0.849"
|
||||
- --context-length
|
||||
- "32768"
|
||||
- --disaggregation-ib-device
|
||||
# should modify according to your RDMA env
|
||||
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
|
||||
- --cuda-graph-max-bs
|
||||
- "64"
|
||||
- --max-running-requests
|
||||
- "2048"
|
||||
- --tp-size
|
||||
- "16" # Size of Tensor Parallelism
|
||||
- --dist-init-addr
|
||||
- $(LWS_LEADER_ADDRESS):20102
|
||||
- --nnodes
|
||||
- $(LWS_GROUP_SIZE)
|
||||
- --node-rank
|
||||
- $(LWS_WORKER_INDEX)
|
||||
- --trust-remote-code
|
||||
- --ep-num-redundant-experts
|
||||
- "32"
|
||||
- --moe-dense-tp-size
|
||||
- "1"
|
||||
env:
|
||||
- name: SGLANG_HACK_DEEPEP_NUM_SMS
|
||||
value: "24"
|
||||
- name: SGLANG_HACK_DEEPEP_NEW_MODE
|
||||
value: "0"
|
||||
- name: NVSHMEM_IB_TRAFFIC_CLASS
|
||||
value: "16"
|
||||
- name: NVSHMEM_IB_GID_INDEX
|
||||
value: "3"
|
||||
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
|
||||
value: "1"
|
||||
- name: NCCL_IB_QPS_PER_CONNECTION
|
||||
value: "8"
|
||||
- name: NCCL_IB_SPLIT_DATA_ON_QPS
|
||||
value: "1"
|
||||
- name: NCCL_NET_PLUGIN
|
||||
value: "none"
|
||||
- name: NCCL_IB_TC
|
||||
value: "136"
|
||||
- name: NCCL_MIN_NCHANNELS
|
||||
value: "4"
|
||||
- name: MC_TE_METRIC
|
||||
value: "true"
|
||||
- name: NCCL_IB_SL
|
||||
value: "5"
|
||||
- name: SGLANG_MOONCAKE_TRANS_THREAD
|
||||
value: "16"
|
||||
- name: SGL_ENABLE_JIT_DEEPGEMM
|
||||
value: "1"
|
||||
- name: NCCL_IB_HCA
|
||||
value: ^=mlx5_0,mlx5_5,mlx5_6
|
||||
- name: LWS_WORKER_INDEX
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
|
||||
image: lmsysorg/sglang:deepep
|
||||
name: sglang-worker
|
||||
ports:
|
||||
- containerPort: 30001
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- IPC_LOCK
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /root/.cache
|
||||
name: sgl-cache
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
- mountPath: /work/models
|
||||
name: model
|
||||
- mountPath: /dev/infiniband
|
||||
name: ib
|
||||
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||
name: cf
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostIPC: true
|
||||
hostNetwork: true
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /data1/sgl_cache1
|
||||
type: DirectoryOrCreate
|
||||
name: sgl-cache
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
name: dshm
|
||||
- hostPath:
|
||||
path: /dev/infiniband
|
||||
name: ib
|
||||
- hostPath:
|
||||
          # modify according to your deployment environment
|
||||
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
|
||||
name: model
|
||||
- hostPath:
|
||||
          # modify according to your deployment environment
|
||||
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
|
||||
name: cf
|
||||
networkConfig:
|
||||
subdomainPolicy: Shared
|
||||
replicas: 1
|
||||
rolloutStrategy:
|
||||
rollingUpdateConfiguration:
|
||||
maxSurge: 0
|
||||
maxUnavailable: 1
|
||||
type: RollingUpdate
|
||||
startupPolicy: LeaderCreated
|
||||
```
|
||||
|
||||
Execute separately:
|
||||
|
||||
```bash
|
||||
kubectl apply -f p.yaml
|
||||
kubectl apply -f d.yaml
|
||||
```
|
||||
|
||||
At this point, we have completed the deployment of the 1P1D SGLang engine part.
|
||||
|
||||
To allow our users to directly experience the model API, we still need a load balancer to handle sequential calls between prefill and decode. Different companies implement LBs differently, and the community will also officially release a new LB component written in Rust in the near future.
|
||||
|
||||
Currently, we use a static K8S service + minilb approach to implement model API calls.
|
||||
|
||||
### Creating Service for Prefill and Decode
|
||||
|
||||
#### Create prefill k8s service
|
||||
[p-svc.yaml](lws-examples/p-svc.yaml)
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-prefill-main
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000
|
||||
targetPort: 30000
|
||||
```
|
||||
Execute `kubectl apply -f p-svc.yaml`
|
||||
|
||||
#### Create decode k8s service
|
||||
[d-svc.yaml](lws-examples/d-svc.yaml)
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-decode-main
|
||||
spec:
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
|
||||
role: leader
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 30000
|
||||
targetPort: 30000
|
||||
```
|
||||
Execute `kubectl apply -f d-svc.yaml`
|
||||
|
||||
#### Deploy minilb and lb service
|
||||
[lb.yaml](lws-examples/lb.yaml)
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: deepseekr10528-lb-main
|
||||
labels:
|
||||
app: deepseekr10528-lb
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: deepseekr10528-lb
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: deepseekr10528-lb
|
||||
spec:
|
||||
nodeSelector:
|
||||
pd: "yes"
|
||||
tolerations:
|
||||
- key: pd
|
||||
operator: Exists
|
||||
- key: node-role
|
||||
operator: Exists
|
||||
containers:
|
||||
- name: sgl-minilb
|
||||
image: lmsysorg/sglang:deepep
|
||||
command:
|
||||
- python
|
||||
- -m
|
||||
- sglang_router.launch_router
|
||||
- --pd-disaggregation
|
||||
- --prefill
|
||||
- http://deepseekr10528-prefill-main:30000
|
||||
- --decode
|
||||
- http://deepseekr10528-decode-main:30000
|
||||
- --host
|
||||
- 0.0.0.0
|
||||
- --port
|
||||
- "8000"
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: deepseekr10528-lb-service
|
||||
spec:
|
||||
type: NodePort
|
||||
selector:
|
||||
app: deepseekr10528-lb
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000 # Service Port(In-Cluster)
|
||||
targetPort: 8000 # Exposed Container
|
||||
nodePort: 30800
|
||||
```
|
||||
Execute `kubectl apply -f lb.yaml`
|
||||
|
||||
After waiting for all model deployments to succeed, you will get the following output:
|
||||
|
||||
```bash
|
||||
[root@ecs-001]# kubectl get po
|
||||
deepseekr10528-decode-main-0 1/1 Running 0 74m
|
||||
deepseekr10528-decode-main-0-1 1/1 Running 0 74m
|
||||
deepseekr10528-lb-main-9c5dbfc57-6lcbd 1/1 Running 0 22m
|
||||
deepseekr10528-prefill-main-0 1/1 Running 0 74m
|
||||
deepseekr10528-prefill-main-0-1 1/1 Running 0 74m
|
||||
[root@ecs-cbm-x1-pd-cpu-001 main_doc]# kubectl get svc |grep dee
|
||||
deepseekr10528-decode-main ClusterIP None <none> <none> 97m
|
||||
deepseekr10528-lb-service NodePort 172.16.242.169 <none> 8000:30800/TCP 22m
|
||||
deepseekr10528-prefill-main ClusterIP None <none> <none> 97m
|
||||
```
|
||||
|
||||
At this point, select any node's IP address and access the service via nodePort 30800:
|
||||
|
||||
```bash
|
||||
[root@ecs-001]# curl -X POST "http://{node_ip}:30800/v1/chat/completions" \
|
||||
> -H "Content-Type: application/json" \
|
||||
> -H "Authorization: Bearer None" \
|
||||
> -d '{
|
||||
> "rid":"ccccdd",
|
||||
> "model": "r1",
|
||||
> "messages": [
|
||||
> {"role": "system", "content": "0: You are a helpful AI assistant"},
|
||||
> {"role": "user", "content": "你是谁?."}
|
||||
> ],
|
||||
> "max_tokens":221
|
||||
> }'
|
||||
{"id":"ccccdd","object":"chat.completion","created":1750252498,"model":"qwen2","choices":[{"index":0,"message":{"role":"assistant","content":"<think>\n嗯,用户问了一个很基础的自我介绍问题"你是谁?"。这可能是第一次互动时的常规开场白,也可能是想确认我的身份和功能范围。\n\n用户没有提供任何背景信息,语气简洁中性。这种场景下新用户的可能性较高,需要给出清晰友好的自我介绍,同时突出实用价值来降低陌生感。\n\n考虑到中文用户,应该用简体中文回复。重点要说明三点:身份归属(深度求索)、功能定位(AI助手)、服务范围(学习/工作/生活)。结尾用开放性问题引导对话很关键——既能了解需求,又能避免让用户面对空白输入框时不知所措。\n\n用波浪线结尾可以软化语气,那个笑脸表情😊刚好能中和AI的机械感。不过要控制表情符号数量,避免显得轻浮。\n</think>\n你好呀!我是你的AI助手,由深度求索公司(DeepSeek)开发的语言模型,名字叫 **DeepSeek-R1**。你可以把我当成一个知识丰富、随叫随到的小帮手~😊\n\n我的任务就是陪你聊天、解答问题、","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":"length","matched_stop":null}],"usage":{"prompt_tokens":14,"total_tokens":235,"completion_tokens":221,"prompt_tokens_details":null}}
|
||||
|
||||
```
|
||||
## FAQ
|
||||
|
||||
1. The current deployment startup parameters may not be fully compatible with all RDMA scenarios. Different RDMA NCCL-related environment configurations may be needed in different network environments.
|
||||
|
||||
2. Some preset, optimized configurations for EPLB are not used here. You can adjust them according to [6017](https://github.com/sgl-project/sglang/issues/6017) as needed.
|
||||
90
docs/references/multi_node_deployment/multi_node.md
Normal file
90
docs/references/multi_node_deployment/multi_node.md
Normal file
@@ -0,0 +1,90 @@
|
||||
# Multi-Node Deployment
|
||||
|
||||
## Llama 3.1 405B
|
||||
|
||||
**Run 405B (fp16) on Two Nodes**
|
||||
|
||||
```bash
|
||||
# replace 172.16.4.52:20000 with your own node ip address and port of the first node
|
||||
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --dist-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0
|
||||
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --dist-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1
|
||||
```
|
||||
|
||||
Note that Llama 405B (fp8) can also be launched on a single node.
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
|
||||
```
|
||||
|
||||
## DeepSeek V3/R1
|
||||
|
||||
Please refer to [DeepSeek documents for reference](https://docs.sglang.ai/basic_usage/deepseek.html#running-examples-on-multi-node).
|
||||
|
||||
## Multi-Node Inference on SLURM
|
||||
|
||||
This example showcases how to serve SGLang server across multiple nodes by SLURM. Submit the following job to the SLURM cluster.
|
||||
|
||||
```
|
||||
#!/bin/bash -l
|
||||
|
||||
#SBATCH -o SLURM_Logs/%x_%j_master.out
|
||||
#SBATCH -e SLURM_Logs/%x_%j_master.err
|
||||
#SBATCH -D ./
|
||||
#SBATCH -J Llama-405B-Online-Inference-TP16-SGL
|
||||
|
||||
#SBATCH --nodes=2
|
||||
#SBATCH --ntasks=2
|
||||
#SBATCH --ntasks-per-node=1 # Ensure 1 task per node
|
||||
#SBATCH --cpus-per-task=18
|
||||
#SBATCH --mem=224GB
|
||||
#SBATCH --partition="lmsys.org"
|
||||
#SBATCH --gres=gpu:8
|
||||
#SBATCH --time=12:00:00
|
||||
|
||||
echo "[INFO] Activating environment on node $SLURM_PROCID"
|
||||
if ! source ENV_FOLDER/bin/activate; then
|
||||
echo "[ERROR] Failed to activate environment" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Define parameters
|
||||
model=MODEL_PATH
|
||||
tp_size=16
|
||||
|
||||
echo "[INFO] Running inference"
|
||||
echo "[INFO] Model: $model"
|
||||
echo "[INFO] TP Size: $tp_size"
|
||||
|
||||
# Set NCCL initialization address using the hostname of the head node
|
||||
HEAD_NODE=$(scontrol show hostname "$SLURM_NODELIST" | head -n 1)
|
||||
NCCL_INIT_ADDR="${HEAD_NODE}:8000"
|
||||
echo "[INFO] NCCL_INIT_ADDR: $NCCL_INIT_ADDR"
|
||||
|
||||
# Launch the model server on each node using SLURM
|
||||
srun --ntasks=2 --nodes=2 --output="SLURM_Logs/%x_%j_node$SLURM_NODEID.out" \
|
||||
--error="SLURM_Logs/%x_%j_node$SLURM_NODEID.err" \
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path "$model" \
|
||||
--grammar-backend "xgrammar" \
|
||||
--tp "$tp_size" \
|
||||
--dist-init-addr "$NCCL_INIT_ADDR" \
|
||||
--nnodes 2 \
|
||||
--node-rank "$SLURM_NODEID" &
|
||||
|
||||
# Wait for the NCCL server to be ready on port 30000
|
||||
while ! nc -z "$HEAD_NODE" 30000; do
|
||||
sleep 1
|
||||
echo "[INFO] Waiting for $HEAD_NODE:30000 to accept connections"
|
||||
done
|
||||
|
||||
echo "[INFO] $HEAD_NODE:30000 is ready to accept connections"
|
||||
|
||||
# Keep the script running until the SLURM job times out
|
||||
wait
|
||||
```
|
||||
|
||||
Then, you can test the server by sending requests following other [documents](https://docs.sglang.ai/backend/openai_api_completions.html).
|
||||
|
||||
Thanks to [aflah02](https://github.com/aflah02) for providing this example, based on his [blog post](https://aflah02.substack.com/p/multi-node-llm-inference-with-sglang).
|
||||
13
docs/references/multi_node_deployment/multi_node_index.rst
Normal file
13
docs/references/multi_node_deployment/multi_node_index.rst
Normal file
@@ -0,0 +1,13 @@
|
||||
Multi-Node Deployment
|
||||
=====================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Multi-Node Deployment
|
||||
|
||||
multi_node.md
|
||||
deploy_on_k8s.md
|
||||
lws_pd/lws_pd_deploy.md
|
||||
|
||||
- `Deploying DeepSeek with PD Disaggregation and Large-Scale Expert Parallelism on 96 H100 GPUs <https://lmsys.org/blog/2025-05-05-large-scale-ep/>`_
|
||||
- `Deploying Kimi K2 with PD Disaggregation and Large-Scale Expert Parallelism on 128 H200 GPUs <https://lmsys.org/blog/2025-07-20-k2-large-scale-ep/>`_
|
||||
217
docs/references/production_metrics.md
Normal file
217
docs/references/production_metrics.md
Normal file
@@ -0,0 +1,217 @@
|
||||
# Production Metrics
|
||||
|
||||
SGLang exposes the following metrics via Prometheus. You can enable it by adding `--enable-metrics` when you launch the server.
|
||||
|
||||
An example of the monitoring dashboard is available in [examples/monitoring/grafana.json](https://github.com/sgl-project/sglang/blob/main/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json).
|
||||
|
||||
Here is an example of the metrics:
|
||||
|
||||
```
|
||||
$ curl http://localhost:30000/metrics
|
||||
# HELP sglang:prompt_tokens_total Number of prefill tokens processed.
|
||||
# TYPE sglang:prompt_tokens_total counter
|
||||
sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.128902e+06
|
||||
# HELP sglang:generation_tokens_total Number of generation tokens processed.
|
||||
# TYPE sglang:generation_tokens_total counter
|
||||
sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.557572e+06
|
||||
# HELP sglang:token_usage The token usage
|
||||
# TYPE sglang:token_usage gauge
|
||||
sglang:token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.28
|
||||
# HELP sglang:cache_hit_rate The cache hit rate
|
||||
# TYPE sglang:cache_hit_rate gauge
|
||||
sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.007507552643049313
|
||||
# HELP sglang:time_to_first_token_seconds Histogram of time to first token in seconds.
|
||||
# TYPE sglang:time_to_first_token_seconds histogram
|
||||
sglang:time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2.3518979474117756e+06
|
||||
sglang:time_to_first_token_seconds_bucket{le="0.001",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="0.06",model_name="meta-llama/Llama-3.1-8B-Instruct"} 3.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="0.08",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="0.25",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 27.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 140.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 314.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="7.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 941.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1330.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1970.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2326.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="25.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2417.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2513.0
|
||||
sglang:time_to_first_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0
|
||||
sglang:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0
|
||||
# HELP sglang:e2e_request_latency_seconds Histogram of End-to-end request latency in seconds
|
||||
# TYPE sglang:e2e_request_latency_seconds histogram
|
||||
sglang:e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 3.116093850019932e+06
|
||||
sglang:e2e_request_latency_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="0.8",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="1.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="2.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 10.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 14.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 247.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="40.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 486.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="50.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 845.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="60.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1513.0
|
||||
sglang:e2e_request_latency_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0
|
||||
sglang:e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0
|
||||
# HELP sglang:time_per_output_token_seconds Histogram of time per output token in seconds.
|
||||
# TYPE sglang:time_per_output_token_seconds histogram
|
||||
sglang:time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 866964.5791549598
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 73.0
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.015",model_name="meta-llama/Llama-3.1-8B-Instruct"} 382.0
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 593.0
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.025",model_name="meta-llama/Llama-3.1-8B-Instruct"} 855.0
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.03",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1035.0
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1815.0
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.05",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11685.0
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.075",model_name="meta-llama/Llama-3.1-8B-Instruct"} 433413.0
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 4.950195e+06
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.15",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.039435e+06
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.2",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.171662e+06
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.266055e+06
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.4",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.296752e+06
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.312226e+06
|
||||
sglang:time_per_output_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.339675e+06
|
||||
sglang:time_per_output_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.357747e+06
|
||||
sglang:time_per_output_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.389414e+06
|
||||
sglang:time_per_output_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06
|
||||
sglang:time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06
|
||||
# HELP sglang:func_latency_seconds Function latency in seconds
|
||||
# TYPE sglang:func_latency_seconds histogram
|
||||
sglang:func_latency_seconds_sum{name="generate_request"} 4.514771912145079
|
||||
sglang:func_latency_seconds_bucket{le="0.05",name="generate_request"} 14006.0
|
||||
sglang:func_latency_seconds_bucket{le="0.07500000000000001",name="generate_request"} 14006.0
|
||||
sglang:func_latency_seconds_bucket{le="0.1125",name="generate_request"} 14006.0
|
||||
sglang:func_latency_seconds_bucket{le="0.16875",name="generate_request"} 14006.0
|
||||
sglang:func_latency_seconds_bucket{le="0.253125",name="generate_request"} 14006.0
|
||||
sglang:func_latency_seconds_bucket{le="0.3796875",name="generate_request"} 14006.0
|
||||
sglang:func_latency_seconds_bucket{le="0.56953125",name="generate_request"} 14006.0
|
||||
sglang:func_latency_seconds_bucket{le="0.8542968750000001",name="generate_request"} 14006.0
|
||||
sglang:func_latency_seconds_bucket{le="1.2814453125",name="generate_request"} 14006.0
|
||||
sglang:func_latency_seconds_bucket{le="1.9221679687500002",name="generate_request"} 14006.0
|
||||
sglang:func_latency_seconds_bucket{le="2.8832519531250003",name="generate_request"} 14006.0
|
||||
sglang:func_latency_seconds_bucket{le="4.3248779296875",name="generate_request"} 14007.0
|
||||
sglang:func_latency_seconds_bucket{le="6.487316894531251",name="generate_request"} 14007.0
|
||||
sglang:func_latency_seconds_bucket{le="9.730975341796876",name="generate_request"} 14007.0
|
||||
sglang:func_latency_seconds_bucket{le="14.596463012695313",name="generate_request"} 14007.0
|
||||
sglang:func_latency_seconds_bucket{le="21.89469451904297",name="generate_request"} 14007.0
|
||||
sglang:func_latency_seconds_bucket{le="32.84204177856446",name="generate_request"} 14007.0
|
||||
sglang:func_latency_seconds_bucket{le="49.26306266784668",name="generate_request"} 14007.0
|
||||
sglang:func_latency_seconds_bucket{le="+Inf",name="generate_request"} 14007.0
|
||||
sglang:func_latency_seconds_count{name="generate_request"} 14007.0
|
||||
# HELP sglang:num_running_reqs The number of running requests
|
||||
# TYPE sglang:num_running_reqs gauge
|
||||
sglang:num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 162.0
|
||||
# HELP sglang:num_used_tokens The number of used tokens
|
||||
# TYPE sglang:num_used_tokens gauge
|
||||
sglang:num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct"} 123859.0
|
||||
# HELP sglang:gen_throughput The generate throughput (token/s)
|
||||
# TYPE sglang:gen_throughput gauge
|
||||
sglang:gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct"} 86.50814177726902
|
||||
# HELP sglang:num_queue_reqs The number of requests in the waiting queue
|
||||
# TYPE sglang:num_queue_reqs gauge
|
||||
sglang:num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2826.0
|
||||
```
|
||||
|
||||
## Setup Guide
|
||||
|
||||
This section describes how to set up the monitoring stack (Prometheus + Grafana) provided in the `examples/monitoring` directory.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Docker and Docker Compose installed
|
||||
- SGLang server running with metrics enabled
|
||||
|
||||
### Usage
|
||||
|
||||
1. **Start your SGLang server with metrics enabled:**
|
||||
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path <your_model_path> --port 30000 --enable-metrics
|
||||
```
|
||||
Replace `<your_model_path>` with the actual path to your model (e.g., `meta-llama/Meta-Llama-3.1-8B-Instruct`). Ensure the server is accessible from the monitoring stack (you might need `--host 0.0.0.0` if running in Docker). By default, the metrics endpoint will be available at `http://<sglang_server_host>:30000/metrics`.
|
||||
|
||||
2. **Navigate to the monitoring example directory:**
|
||||
```bash
|
||||
cd examples/monitoring
|
||||
```
|
||||
|
||||
3. **Start the monitoring stack:**
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
This command will start Prometheus and Grafana in the background.
|
||||
|
||||
4. **Access the monitoring interfaces:**
|
||||
* **Grafana:** Open your web browser and go to [http://localhost:3000](http://localhost:3000).
|
||||
* **Prometheus:** Open your web browser and go to [http://localhost:9090](http://localhost:9090).
|
||||
|
||||
5. **Log in to Grafana:**
|
||||
* Default Username: `admin`
|
||||
* Default Password: `admin`
|
||||
You will be prompted to change the password upon your first login.
|
||||
|
||||
6. **View the Dashboard:**
|
||||
The SGLang dashboard is pre-configured and should be available automatically. Navigate to `Dashboards` -> `Browse` -> `SGLang Monitoring` folder -> `SGLang Dashboard`.
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
* **Port Conflicts:** If you encounter errors like "port is already allocated," check if other services (including previous instances of Prometheus/Grafana) are using ports `9090` or `3000`. Use `docker ps` to find running containers and `docker stop <container_id>` to stop them, or use `lsof -i :<port>` to find other processes using the ports. You might need to adjust the ports in the `docker-compose.yaml` file if they permanently conflict with other essential services on your system.
|
||||
|
||||
To change Grafana's port to a different one (like 3090) in your Docker Compose file, you need to explicitly specify the port mapping under the grafana service.
|
||||
|
||||
Option 1: Add GF_SERVER_HTTP_PORT to the environment section:
|
||||
```
|
||||
environment:
|
||||
- GF_AUTH_ANONYMOUS_ENABLED=true
|
||||
- GF_SERVER_HTTP_PORT=3090 # <-- Add this line
|
||||
```
|
||||
Option 2: Use port mapping:
|
||||
```
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
container_name: grafana
|
||||
ports:
|
||||
- "3090:3000" # <-- Host:Container port mapping
|
||||
```
|
||||
* **Connection Issues:**
|
||||
* Ensure both Prometheus and Grafana containers are running (`docker ps`).
|
||||
* Verify the Prometheus data source configuration in Grafana (usually auto-configured via `grafana/datasources/datasource.yaml`). Go to `Connections` -> `Data sources` -> `Prometheus`. The URL should point to the Prometheus service (e.g., `http://prometheus:9090`).
|
||||
* Confirm that your SGLang server is running and the metrics endpoint (`http://<sglang_server_host>:30000/metrics`) is accessible *from the Prometheus container*. If SGLang is running on your host machine and Prometheus is in Docker, use `host.docker.internal` (on Docker Desktop) or your machine's network IP instead of `localhost` in the `prometheus.yaml` scrape configuration.
|
||||
* **No Data on Dashboard:**
|
||||
* Generate some traffic to your SGLang server to produce metrics. For example, run a benchmark:
|
||||
```bash
|
||||
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 100 --random-input 128 --random-output 128
|
||||
```
|
||||
* Check the Prometheus UI (`http://localhost:9090`) under `Status` -> `Targets` to see if the SGLang endpoint is being scraped successfully.
|
||||
* Verify the `model_name` and `instance` labels in your Prometheus metrics match the variables used in the Grafana dashboard. You might need to adjust the Grafana dashboard variables or the labels in your Prometheus configuration.
|
||||
|
||||
### Configuration Files
|
||||
|
||||
The monitoring setup is defined by the following files within the `examples/monitoring` directory:
|
||||
|
||||
* `docker-compose.yaml`: Defines the Prometheus and Grafana services.
|
||||
* `prometheus.yaml`: Prometheus configuration, including scrape targets.
|
||||
* `grafana/datasources/datasource.yaml`: Configures the Prometheus data source for Grafana.
|
||||
* `grafana/dashboards/config/dashboard.yaml`: Tells Grafana to load dashboards from the specified path.
|
||||
* `grafana/dashboards/json/sglang-dashboard.json`: The actual Grafana dashboard definition in JSON format.
|
||||
|
||||
You can customize the setup by modifying these files. For instance, you might need to update the `static_configs` target in `prometheus.yaml` if your SGLang server runs on a different host or port.
|
||||
|
||||
#### Check if the metrics are being collected
|
||||
|
||||
Run `python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5` to generate some requests.
|
||||
|
||||
Then you should be able to see the metrics in the Grafana dashboard.
|
||||
13
docs/references/torch_compile_cache.md
Normal file
13
docs/references/torch_compile_cache.md
Normal file
@@ -0,0 +1,13 @@
|
||||
# Enabling cache for torch.compile
|
||||
|
||||
SGLang uses `max-autotune-no-cudagraphs` mode of torch.compile. The auto-tuning can be slow.
|
||||
If you want to deploy a model on many different machines, you can ship the torch.compile cache to these machines and skip the compilation steps.
|
||||
|
||||
This is based on https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html
|
||||
|
||||
|
||||
1. Generate the cache by setting TORCHINDUCTOR_CACHE_DIR and running the model once.
|
||||
```
|
||||
TORCHINDUCTOR_CACHE_DIR=/root/inductor_root_cache python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile
|
||||
```
|
||||
2. Copy the cache folder to other machines and launch the server with `TORCHINDUCTOR_CACHE_DIR`.
|
||||
20
docs/requirements.txt
Normal file
20
docs/requirements.txt
Normal file
@@ -0,0 +1,20 @@
|
||||
ipykernel
|
||||
ipywidgets
|
||||
jupyter_client
|
||||
markdown>=3.4.0
|
||||
matplotlib
|
||||
myst-parser
|
||||
nbconvert
|
||||
nbsphinx
|
||||
pandoc
|
||||
pillow
|
||||
pydantic
|
||||
sphinx
|
||||
sphinx-book-theme
|
||||
sphinx-copybutton
|
||||
sphinx-tabs
|
||||
nbstripout
|
||||
sphinxcontrib-mermaid
|
||||
urllib3<2.0.0
|
||||
gguf>=0.10.0
|
||||
sphinx-autobuild
|
||||
3
docs/serve.sh
Normal file
3
docs/serve.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
# Clean and serve documentation with auto-build
|
||||
make clean
|
||||
make serve
|
||||
87
docs/supported_models/embedding_models.md
Normal file
87
docs/supported_models/embedding_models.md
Normal file
@@ -0,0 +1,87 @@
|
||||
# Embedding Models
|
||||
|
||||
SGLang provides robust support for embedding models by integrating efficient serving mechanisms with its flexible programming interface. This integration allows for streamlined handling of embedding tasks, facilitating faster and more accurate retrieval and semantic search operations. SGLang's architecture enables better resource utilization and reduced latency in embedding model deployment.
|
||||
|
||||
```{important}
|
||||
Embedding models are executed with the `--is-embedding` flag, and some may require `--trust-remote-code`.
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Launch Server
|
||||
|
||||
```shell
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path Qwen/Qwen3-Embedding-4B \
|
||||
--is-embedding \
|
||||
--host 0.0.0.0 \
|
||||
--port 30000
|
||||
```
|
||||
|
||||
### Client Request
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
url = "http://127.0.0.1:30000"
|
||||
|
||||
payload = {
|
||||
"model": "Qwen/Qwen3-Embedding-4B",
|
||||
"input": "What is the capital of France?",
|
||||
"encoding_format": "float"
|
||||
}
|
||||
|
||||
response = requests.post(url + "/v1/embeddings", json=payload).json()
|
||||
print("Embedding:", response["data"][0]["embedding"])
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Multimodal Embedding Example
|
||||
|
||||
For multimodal models like GME that support both text and images:
|
||||
|
||||
```shell
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path Alibaba-NLP/gme-Qwen2-VL-2B-Instruct \
|
||||
--is-embedding \
|
||||
--chat-template gme-qwen2-vl \
|
||||
--host 0.0.0.0 \
|
||||
--port 30000
|
||||
```
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
url = "http://127.0.0.1:30000"
|
||||
|
||||
text_input = "Represent this image in embedding space."
|
||||
image_path = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
|
||||
|
||||
payload = {
|
||||
"model": "gme-qwen2-vl",
|
||||
"input": [
|
||||
{
|
||||
"text": text_input
|
||||
},
|
||||
{
|
||||
"image": image_path
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
response = requests.post(url + "/v1/embeddings", json=payload).json()
|
||||
|
||||
print("Embeddings:", [x.get("embedding") for x in response.get("data", [])])
|
||||
```
|
||||
|
||||
## Supported Models
|
||||
|
||||
| Model Family | Example Model | Chat Template | Description |
|
||||
| ------------------------------------------ | -------------------------------------- | ------------- | --------------------------------------------------------------------------- |
|
||||
| **E5 (Llama/Mistral based)** | `intfloat/e5-mistral-7b-instruct` | N/A | High-quality text embeddings based on Mistral/Llama architectures |
|
||||
| **GTE-Qwen2** | `Alibaba-NLP/gte-Qwen2-7B-instruct` | N/A | Alibaba's text embedding model with multilingual support |
|
||||
| **Qwen3-Embedding** | `Qwen/Qwen3-Embedding-4B` | N/A | Latest Qwen3-based text embedding model for semantic representation |
|
||||
| **BGE** | `BAAI/bge-large-en-v1.5` | N/A | BAAI's text embeddings (requires `attention-backend` triton/torch_native) |
|
||||
| **GME (Multimodal)** | `Alibaba-NLP/gme-Qwen2-VL-2B-Instruct`| `gme-qwen2-vl`| Multimodal embedding for text and image cross-modal tasks |
|
||||
| **CLIP** | `openai/clip-vit-large-patch14-336` | N/A | OpenAI's CLIP for image and text embeddings |
|
||||
56
docs/supported_models/generative_models.md
Normal file
56
docs/supported_models/generative_models.md
Normal file
@@ -0,0 +1,56 @@
|
||||
# Large Language Models
|
||||
|
||||
These models accept text input and produce text output (e.g., chat completions). They are primarily large language models (LLMs), some with mixture-of-experts (MoE) architectures for scaling.
|
||||
|
||||
## Example Launch Command
|
||||
|
||||
```shell
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path meta-llama/Llama-3.2-1B-Instruct \ # example HF/local path
|
||||
--host 0.0.0.0 \
|
||||
--port 30000 \
|
||||
```
|
||||
|
||||
## Supported models
|
||||
|
||||
The supported models are summarized in the table below.
|
||||
|
||||
If you are unsure if a specific architecture is implemented, you can search for it via GitHub. For example, to search for `Qwen3ForCausalLM`, use the expression:
|
||||
|
||||
```
|
||||
repo:sgl-project/sglang path:/^python\/sglang\/srt\/models\// Qwen3ForCausalLM
|
||||
```
|
||||
|
||||
in the GitHub search bar.
|
||||
|
||||
| Model Family (Variants) | Example HuggingFace Identifier | Description |
|
||||
|-------------------------------------|--------------------------------------------------|----------------------------------------------------------------------------------------|
|
||||
| **DeepSeek** (v1, v2, v3/R1) | `deepseek-ai/DeepSeek-R1` | Series of advanced reasoning-optimized models (including a 671B MoE) trained with reinforcement learning; top performance on complex reasoning, math, and code tasks. [SGLang provides Deepseek v3/R1 model-specific optimizations](../basic_usage/deepseek.md) and [Reasoning Parser](../advanced_features/separate_reasoning.ipynb)|
|
||||
| **GPT-OSS** | `openai/gpt-oss-20b`, `openai/gpt-oss-120b` | OpenAI’s latest GPT-OSS series for complex reasoning, agentic tasks, and versatile developer use cases.|
|
||||
| **Qwen** (3, 3MoE, 3Next, 2.5, 2 series) | `Qwen/Qwen3-0.6B`, `Qwen/Qwen3-30B-A3B`, `Qwen/Qwen3-Next-80B-A3B-Instruct` | Alibaba’s latest Qwen3 series for complex reasoning, language understanding, and generation tasks; Support for MoE variants along with previous generation 2.5, 2, etc. [SGLang provides Qwen3 specific reasoning parser](../advanced_features/separate_reasoning.ipynb)|
|
||||
| **Llama** (2, 3.x, 4 series) | `meta-llama/Llama-4-Scout-17B-16E-Instruct` | Meta's open LLM series, spanning 7B to 400B parameters (Llama 2, 3, and new Llama 4) with well-recognized performance. [SGLang provides Llama-4 model-specific optimizations](../basic_usage/llama4.md) |
|
||||
| **Mistral** (Mixtral, NeMo, Small3) | `mistralai/Mistral-7B-Instruct-v0.2` | Open 7B LLM by Mistral AI with strong performance; extended into MoE (“Mixtral”) and NeMo Megatron variants for larger scale. |
|
||||
| **Gemma** (v1, v2, v3) | `google/gemma-3-1b-it` | Google’s family of efficient multilingual models (1B–27B); Gemma 3 offers a 128K context window, and its larger (4B+) variants support vision input. |
|
||||
| **Phi** (Phi-1.5, Phi-2, Phi-3, Phi-4, Phi-MoE series) | `microsoft/Phi-4-multimodal-instruct`, `microsoft/Phi-3.5-MoE-instruct` | Microsoft’s Phi family of small models (1.3B–5.6B); Phi-4-multimodal (5.6B) processes text, images, and speech, Phi-4-mini is a high-accuracy text model and Phi-3.5-MoE is a mixture-of-experts model. |
|
||||
| **MiniCPM** (v3, 4B) | `openbmb/MiniCPM3-4B` | OpenBMB’s series of compact LLMs for edge devices; MiniCPM 3 (4B) achieves GPT-3.5-level results in text tasks. |
|
||||
| **OLMoE** (Open MoE) | `allenai/OLMoE-1B-7B-0924` | Allen AI’s open Mixture-of-Experts model (7B total, 1B active parameters) delivering state-of-the-art results with sparse expert activation. |
|
||||
| **StableLM** (3B, 7B) | `stabilityai/stablelm-tuned-alpha-7b` | StabilityAI’s early open-source LLM (3B & 7B) for general text generation; a demonstration model with basic instruction-following ability. |
|
||||
| **Command-R** (Cohere) | `CohereForAI/c4ai-command-r-v01` | Cohere’s open conversational LLM (Command series) optimized for long context, retrieval-augmented generation, and tool use. |
|
||||
| **DBRX** (Databricks) | `databricks/dbrx-instruct` | Databricks’ 132B-parameter MoE model (36B active) trained on 12T tokens; competes with GPT-3.5 quality as a fully open foundation model. |
|
||||
| **Grok** (xAI) | `xai-org/grok-1` | xAI’s grok-1 model known for vast size(314B parameters) and high quality; integrated in SGLang for high-performance inference. |
|
||||
| **ChatGLM** (GLM-130B family) | `THUDM/chatglm2-6b` | Zhipu AI’s bilingual chat model (6B) excelling at Chinese-English dialogue; fine-tuned for conversational quality and alignment. |
|
||||
| **InternLM 2** (7B, 20B) | `internlm/internlm2-7b` | Next-gen InternLM (7B and 20B) from SenseTime, offering strong reasoning and ultra-long context support (up to 200K tokens). |
|
||||
| **ExaONE 3** (Korean-English) | `LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct` | LG AI Research’s Korean-English model (7.8B) trained on 8T tokens; provides high-quality bilingual understanding and generation. |
|
||||
| **Baichuan 2** (7B, 13B) | `baichuan-inc/Baichuan2-13B-Chat` | BaichuanAI’s second-generation Chinese-English LLM (7B/13B) with improved performance and an open commercial license. |
|
||||
| **XVERSE** (MoE) | `xverse/XVERSE-MoE-A36B` | Yuanxiang’s open MoE LLM (XVERSE-MoE-A36B: 255B total, 36B active) supporting ~40 languages; delivers 100B+ dense-level performance via expert routing. |
|
||||
| **SmolLM** (135M–1.7B) | `HuggingFaceTB/SmolLM-1.7B` | Hugging Face’s ultra-small LLM series (135M–1.7B params) offering surprisingly strong results, enabling advanced AI on mobile/edge devices. |
|
||||
| **GLM-4** (Multilingual 9B) | `ZhipuAI/glm-4-9b-chat` | Zhipu’s GLM-4 series (up to 9B parameters) – open multilingual models with support for 1M-token context and even a 5.6B multimodal variant (GLM-4V). |
|
||||
| **MiMo** (7B series) | `XiaomiMiMo/MiMo-7B-RL` | Xiaomi's reasoning-optimized model series, leverages Multiple-Token Prediction for faster inference. |
|
||||
| **ERNIE-4.5** (4.5, 4.5MoE series) | `baidu/ERNIE-4.5-21B-A3B-PT` | Baidu's ERNIE-4.5 series which consists of MoE with 47B and 3B active parameters, with the largest model having 424B total parameters, as well as a 0.3B dense model. |
|
||||
| **Arcee AFM-4.5B** | `arcee-ai/AFM-4.5B-Base` | Arcee's foundational model series for real world reliability and edge deployments. |
|
||||
| **Persimmon** (8B) | `adept/persimmon-8b-chat` | Adept’s open 8B model with a 16K context window and fast inference; trained for broad usability and licensed under Apache 2.0. |
|
||||
| **Ling** (16.8B–290B) | `inclusionAI/Ling-lite`, `inclusionAI/Ling-plus` | InclusionAI’s open MoE models. Ling-Lite has 16.8B total / 2.75B active parameters, and Ling-Plus has 290B total / 28.8B active parameters. They are designed for high performance on NLP and complex reasoning tasks. |
|
||||
| **Granite 3.0, 3.1** (IBM) | `ibm-granite/granite-3.1-8b-instruct` | IBM's open dense foundation models optimized for reasoning, code, and business AI use cases. Integrated with Red Hat and watsonx systems. |
|
||||
| **Granite 3.0 MoE** (IBM) | `ibm-granite/granite-3.0-3b-a800m-instruct` | IBM’s Mixture-of-Experts models offering strong performance with cost-efficiency. MoE expert routing designed for enterprise deployment at scale. |
|
||||
| **Llama Nemotron Super** (v1, v1.5, NVIDIA) | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, `nvidia/Llama-3_3-Nemotron-Super-49B-v1_5` | The [NVIDIA Nemotron](https://www.nvidia.com/en-us/ai-data-science/foundation-models/nemotron/) family builds on the strongest open models in the ecosystem by enhancing them with greater accuracy, efficiency, and transparency using NVIDIA open synthetic datasets, advanced techniques, and tools. This enables the creation of practical, right-sized, and high-performing AI agents. |
|
||||
| **Llama Nemotron Ultra** (v1, NVIDIA) | `nvidia/Llama-3_1-Nemotron-Ultra-253B-v1` | The [NVIDIA Nemotron](https://www.nvidia.com/en-us/ai-data-science/foundation-models/nemotron/) family builds on the strongest open models in the ecosystem by enhancing them with greater accuracy, efficiency, and transparency using NVIDIA open synthetic datasets, advanced techniques, and tools. This enables the creation of practical, right-sized, and high-performing AI agents. |
|
||||
28
docs/supported_models/modelscope.md
Normal file
28
docs/supported_models/modelscope.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# Use Models From ModelScope
|
||||
|
||||
To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable `SGLANG_USE_MODELSCOPE`.
|
||||
|
||||
```bash
|
||||
export SGLANG_USE_MODELSCOPE=true
|
||||
```
|
||||
|
||||
We take [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) as an example.
|
||||
|
||||
Launch the Server:
|
||||
```bash
|
||||
python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
|
||||
```
|
||||
|
||||
Or start it by docker:
|
||||
|
||||
```bash
|
||||
docker run --gpus all \
|
||||
-p 30000:30000 \
|
||||
-v ~/.cache/modelscope:/root/.cache/modelscope \
|
||||
--env "SGLANG_USE_MODELSCOPE=true" \
|
||||
--ipc=host \
|
||||
lmsysorg/sglang:latest \
|
||||
python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
|
||||
```
|
||||
|
||||
Note that ModelScope uses a different cache directory than Hugging Face. You may need to set it manually to avoid running out of disk space.
|
||||
42
docs/supported_models/multimodal_language_models.md
Normal file
42
docs/supported_models/multimodal_language_models.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# Multimodal Language Models
|
||||
|
||||
These models accept multi-modal inputs (e.g., images and text) and generate text output. They augment language models with multimodal encoders.
|
||||
|
||||
## Example Launch Command
|
||||
|
||||
```shell
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path meta-llama/Llama-3.2-11B-Vision-Instruct \ # example HF/local path
|
||||
--host 0.0.0.0 \
|
||||
--port 30000 \
|
||||
```
|
||||
|
||||
## Supported models
|
||||
|
||||
The supported models are summarized in the table below.
|
||||
|
||||
If you are unsure if a specific architecture is implemented, you can search for it via GitHub. For example, to search for `Qwen2_5_VLForConditionalGeneration`, use the expression:
|
||||
|
||||
```
|
||||
repo:sgl-project/sglang path:/^python\/sglang\/srt\/models\// Qwen2_5_VLForConditionalGeneration
|
||||
```
|
||||
|
||||
in the GitHub search bar.
|
||||
|
||||
|
||||
| Model Family (Variants) | Example HuggingFace Identifier | Chat Template | Description |
|
||||
|----------------------------|--------------------------------------------|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| **Qwen-VL** (Qwen2 series) | `Qwen/Qwen2.5-VL-7B-Instruct` | `qwen2-vl` | Alibaba’s vision-language extension of Qwen; for example, Qwen2.5-VL (7B and larger variants) can analyze and converse about image content. |
|
||||
| **DeepSeek-VL2** | `deepseek-ai/deepseek-vl2` | `deepseek-vl2` | Vision-language variant of DeepSeek (with a dedicated image processor), enabling advanced multimodal reasoning on image and text inputs. |
|
||||
| **Janus-Pro** (1B, 7B) | `deepseek-ai/Janus-Pro-7B` | `janus-pro` | DeepSeek’s open-source multimodal model capable of both image understanding and generation. Janus-Pro employs a decoupled architecture for separate visual encoding paths, enhancing performance in both tasks. |
|
||||
| **MiniCPM-V / MiniCPM-o** | `openbmb/MiniCPM-V-2_6` | `minicpmv` | MiniCPM-V (2.6, ~8B) supports image inputs, and MiniCPM-o adds audio/video; these multimodal LLMs are optimized for end-side deployment on mobile/edge devices. |
|
||||
| **Llama 3.2 Vision** (11B) | `meta-llama/Llama-3.2-11B-Vision-Instruct` | `llama_3_vision` | Vision-enabled variant of Llama 3 (11B) that accepts image inputs for visual question answering and other multimodal tasks. |
|
||||
| **LLaVA** (v1.5 & v1.6) | *e.g.* `liuhaotian/llava-v1.5-13b` | `vicuna_v1.1` | Open vision-chat models that add an image encoder to LLaMA/Vicuna (e.g. LLaMA2 13B) for following multimodal instruction prompts. |
|
||||
| **LLaVA-NeXT** (8B, 72B) | `lmms-lab/llava-next-72b` | `chatml-llava` | Improved LLaVA models (with an 8B Llama3 version and a 72B version) offering enhanced visual instruction-following and accuracy on multimodal benchmarks. |
|
||||
| **LLaVA-OneVision** | `lmms-lab/llava-onevision-qwen2-7b-ov` | `chatml-llava` | Enhanced LLaVA variant integrating Qwen as the backbone; supports multiple images (and even video frames) as inputs via an OpenAI Vision API-compatible format. |
|
||||
| **Gemma 3 (Multimodal)** | `google/gemma-3-4b-it` | `gemma-it` | Gemma 3's larger models (4B, 12B, 27B) accept images (each image encoded as 256 tokens) alongside text in a combined 128K-token context. |
|
||||
| **Kimi-VL** (A3B) | `moonshotai/Kimi-VL-A3B-Instruct` | `kimi-vl` | Kimi-VL is a multimodal model that can understand and generate text from images. |
|
||||
| **Mistral-Small-3.1-24B** | `mistralai/Mistral-Small-3.1-24B-Instruct-2503` | `mistral` | Mistral 3.1 is a multimodal model that can generate text from text or images input. It also supports tool calling and structured output. |
|
||||
| **Phi-4-multimodal-instruct** | `microsoft/Phi-4-multimodal-instruct` | `phi-4-mm` | Phi-4-multimodal-instruct is the multimodal variant of the Phi-4-mini model, enhanced with LoRA for improved multimodal capabilities. It supports text, vision and audio modalities in SGLang. |
|
||||
| **MiMo-VL** (7B) | `XiaomiMiMo/MiMo-VL-7B-RL` | `mimo-vl` | Xiaomi's compact yet powerful vision-language model featuring a native resolution ViT encoder for fine-grained visual details, an MLP projector for cross-modal alignment, and the MiMo-7B language model optimized for complex reasoning tasks. |
|
||||
| **GLM-4.5V** (106B) / **GLM-4.1V**(9B) | `zai-org/GLM-4.5V` | `glm-4v` | GLM-4.5V and GLM-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning |
|
||||
49
docs/supported_models/rerank_models.md
Normal file
49
docs/supported_models/rerank_models.md
Normal file
@@ -0,0 +1,49 @@
|
||||
# Rerank Models
|
||||
|
||||
SGLang offers comprehensive support for rerank models by incorporating optimized serving frameworks with a flexible programming interface. This setup enables efficient processing of cross-encoder reranking tasks, improving the accuracy and relevance of search result ordering. SGLang’s design ensures high throughput and low latency during reranker model deployment, making it ideal for semantic-based result refinement in large-scale retrieval systems.
|
||||
|
||||
```{important}
|
||||
They are executed with `--is-embedding` and some may require `--trust-remote-code`
|
||||
```
|
||||
|
||||
## Example Launch Command
|
||||
|
||||
```shell
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path BAAI/bge-reranker-v2-m3 \
|
||||
--host 0.0.0.0 \
|
||||
--disable-radix-cache \
|
||||
--chunked-prefill-size -1 \
|
||||
--attention-backend triton \
|
||||
--is-embedding \
|
||||
--port 30000
|
||||
```
|
||||
|
||||
## Example Client Request
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
url = "http://127.0.0.1:30000/v1/rerank"
|
||||
|
||||
payload = {
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"query": "what is panda?",
|
||||
"documents": [
|
||||
"hi",
|
||||
"The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
|
||||
]
|
||||
}
|
||||
|
||||
response = requests.post(url, json=payload)
|
||||
response_json = response.json()
|
||||
|
||||
for item in response_json:
|
||||
print(f"Score: {item['score']:.2f} - Document: '{item['document']}'")
|
||||
```
|
||||
|
||||
## Supported rerank models
|
||||
|
||||
| Model Family (Rerank) | Example HuggingFace Identifier | Chat Template | Description |
|
||||
|------------------------------------------------|--------------------------------------|---------------|----------------------------------------------------------------------------------------------------------------------------------|
|
||||
| **BGE-Reranker (BgeRerankModel)** | `BAAI/bge-reranker-v2-m3` | N/A | Currently only supports `attention-backend` `triton` and `torch_native`. A high-performance cross-encoder reranker model from BAAI. Suitable for reranking search results based on semantic relevance. |
|
||||
28
docs/supported_models/reward_models.md
Normal file
28
docs/supported_models/reward_models.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# Reward Models
|
||||
|
||||
These models output a scalar reward score or classification result, often used in reinforcement learning or content moderation tasks.
|
||||
|
||||
```{important}
|
||||
They are executed with `--is-embedding` and some may require `--trust-remote-code`.
|
||||
```
|
||||
|
||||
## Example Launch Command
|
||||
|
||||
```shell
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path Qwen/Qwen2.5-Math-RM-72B \ # example HF/local path
|
||||
--is-embedding \
|
||||
--host 0.0.0.0 \
|
||||
--tp-size=4 \ # set for tensor parallelism
|
||||
--port 30000 \
|
||||
```
|
||||
|
||||
## Supported models
|
||||
|
||||
| Model Family (Reward) | Example HuggingFace Identifier | Description |
|
||||
|---------------------------------------------------------------------------|-----------------------------------------------------|---------------------------------------------------------------------------------|
|
||||
| **Llama (3.1 Reward / `LlamaForSequenceClassification`)** | `Skywork/Skywork-Reward-Llama-3.1-8B-v0.2` | Reward model (preference classifier) based on Llama 3.1 (8B) for scoring and ranking responses for RLHF. |
|
||||
| **Gemma 2 (27B Reward / `Gemma2ForSequenceClassification`)** | `Skywork/Skywork-Reward-Gemma-2-27B-v0.2` | Derived from Gemma‑2 (27B), this model provides human preference scoring for RLHF and multilingual tasks. |
|
||||
| **InternLM 2 (Reward / `InternLM2ForRewardModel`)** | `internlm/internlm2-7b-reward` | InternLM 2 (7B)–based reward model used in alignment pipelines to guide outputs toward preferred behavior. |
|
||||
| **Qwen2.5 (Reward - Math / `Qwen2ForRewardModel`)** | `Qwen/Qwen2.5-Math-RM-72B` | A 72B math-specialized RLHF reward model from the Qwen2.5 series, tuned for evaluating and refining responses. |
|
||||
| **Qwen2.5 (Reward - Sequence / `Qwen2ForSequenceClassification`)** | `jason9693/Qwen2.5-1.5B-apeach` | A smaller Qwen2.5 variant used for sequence classification, offering an alternative RLHF scoring mechanism. |
|
||||
320
docs/supported_models/support_new_models.md
Normal file
320
docs/supported_models/support_new_models.md
Normal file
@@ -0,0 +1,320 @@
|
||||
# How to Support New Models
|
||||
|
||||
This document explains how to add support for new language models and multimodal large language models (MLLMs) in
|
||||
SGLang. It also covers how to test new models and register external implementations.
|
||||
|
||||
## How to Support a New Language Model
|
||||
|
||||
To support a new model in SGLang, you only need to add a single file under
|
||||
the [SGLang Models Directory](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/models). You can learn
|
||||
from existing model implementations and create a new file for your model. For most models, you should be able to find a
|
||||
similar model to start with (e.g., starting from Llama). Also refer to how
|
||||
to [port a Model from vLLM to SGLang](#port-a-model-from-vllm-to-sglang)
|
||||
|
||||
## How to Support a New Multimodal Large Language Model
|
||||
|
||||
To support a new multimodal large language model (MLLM) in SGLang, there are several key components in addition to the
|
||||
standard LLM support:
|
||||
|
||||
1. **Register your new model as multimodal**:
|
||||
Extend `is_multimodal_model`
|
||||
in [model_config.py](https://github.com/sgl-project/sglang/blob/0ab3f437aba729b348a683ab32b35b214456efc7/python/sglang/srt/configs/model_config.py#L561)
|
||||
to return `True` for your model.
|
||||
|
||||
2. **Register a new chat-template**:
|
||||
Only when your default chat-template is unable to accept images as input: Register a new chat template in [conversation.py](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/conversation.py) and the corresponding matching function.
|
||||
|
||||
3. **Multimodal Data Processor**:
|
||||
Define a new `Processor` class that inherits from `BaseMultimodalProcessor` and register this processor as your
|
||||
model’s dedicated processor.
|
||||
See [multimodal_processor.py](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/multimodal/processors)
|
||||
for more details.
|
||||
|
||||
4. **Handle Multimodal Tokens**:
|
||||
Implement a `pad_input_ids` function for your new model. In this function, multimodal tokens in the prompt should be
|
||||
expanded (if necessary) and padded with multimodal-data-hashes so that SGLang can recognize different multimodal data
|
||||
with `RadixAttention`.
|
||||
|
||||
5. **Handle Image Feature Extraction**:
|
||||
Implement a `get_image_feature` function for your new model, which extracts image features from raw image data and converts them into the embeddings used by the language model.
|
||||
|
||||
6. **Adapt to Vision Attention**:
|
||||
Adapt the multi-headed `Attention` of ViT with SGLang’s `VisionAttention`.
|
||||
|
||||
You can refer to [Qwen2VL](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/qwen2_vl.py) or
|
||||
other mllm implementations. These models demonstrate how to correctly handle both multimodal and textual inputs.
|
||||
|
||||
## Testing and Debugging
|
||||
|
||||
Please note all your testing and benchmarking results in PR description.
|
||||
|
||||
### Interactive Debugging
|
||||
|
||||
For interactive debugging, compare the outputs of Hugging Face/Transformers and SGLang. The following two commands
|
||||
should give the same text output and very similar prefill logits:
|
||||
|
||||
- Get the reference output:
|
||||
```bash
|
||||
python3 scripts/playground/reference_hf.py --model-path [new model] --model-type {text,mllm}
|
||||
```
|
||||
- Get the SGLang output:
|
||||
```bash
|
||||
python3 -m sglang.bench_one_batch --correct --model [new model]
|
||||
```
|
||||
|
||||
### Add the Model to the Test Suite
|
||||
|
||||
To ensure the new model is well maintained, add it to the test suite by including it in the `ALL_OTHER_MODELS` list in
|
||||
the [test_generation_models.py](https://github.com/sgl-project/sglang/blob/main/test/srt/models/test_generation_models.py)
|
||||
file, test the new model on your local machine and report the results on demonstrative benchmarks (GSM8K, MMLU, MMMU,
|
||||
MMMU-Pro, etc.) in your PR. \\
|
||||
For VLMs, also include a test in `test_vision_openai_server_{x}.py` (e.g. [test_vision_openai_server_a.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_vision_openai_server_a.py), [test_vision_openai_server_b.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_vision_openai_server_b.py)).
|
||||
|
||||
|
||||
This is an example command to run to test a new model on your local machine:
|
||||
|
||||
```bash
|
||||
ONLY_RUN=Qwen/Qwen2-1.5B python3 -m unittest test_generation_models.TestGenerationModels.test_others
|
||||
```
|
||||
|
||||
### Benchmark
|
||||
|
||||
- **(Required) MMMU**: follow MMMU benchmark [README.md](https://github.com/sgl-project/sglang/blob/main/benchmark/mmmu/README.md) to get SGLang vs. HF Transformer accuracy comparison. The accuracy score from SGLang run should not be much lower than that from HF Transformer run. Similarly, follow https://docs.sglang.ai/developer_guide/benchmark_and_profiling.html to get performance comparison: TTFT and throughput must meet or exceed baselines (e.g., HF Transformer).
|
||||
- **(Optional) Other evals**: If you ran other evals, please note the results in PR description.
|
||||
|
||||
## Port a Model from vLLM to SGLang
|
||||
|
||||
The [vLLM Models Directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) is a valuable
|
||||
resource, as vLLM covers many models. SGLang reuses vLLM’s interface and some layers, making it easier to port models
|
||||
from vLLM to SGLang.
|
||||
|
||||
To port a model from vLLM to SGLang:
|
||||
|
||||
- Compare these two files for guidance:
|
||||
- [SGLang Llama Implementation](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/llama.py)
|
||||
- [vLLM Llama Implementation](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py)
|
||||
- The major differences include:
|
||||
- **Replace vLLM’s `Attention` with `RadixAttention`** (ensure you pass `layer_id` to `RadixAttention`).
|
||||
- **Replace vLLM’s `LogitsProcessor` with SGLang’s `LogitsProcessor`.**
|
||||
- **Replace the multi-headed `Attention` of ViT with SGLang’s `VisionAttention`.**
|
||||
- **Replace other vLLM layers** (such as `RMSNorm`, `SiluAndMul`) with SGLang layers.
|
||||
- **Remove `Sample`.**
|
||||
- **Change the `forward()` functions** and add a `forward_batch()` method.
|
||||
- **Add `EntryClass`** at the end.
|
||||
- **Ensure that the new implementation uses only SGLang components** and does not rely on any vLLM components.
|
||||
|
||||
Note: make sure you add your new model to the supported models list in the supported models documentation.
|
||||
|
||||
## Registering an External Model Implementation
|
||||
|
||||
In addition to the methods above, you can register your new model with the `ModelRegistry` before launching the server.
|
||||
This allows you to integrate your model without modifying the source code.
|
||||
|
||||
For example:
|
||||
|
||||
```python
|
||||
from sglang.srt.models.registry import ModelRegistry
|
||||
from sglang.srt.entrypoints.http_server import launch_server
|
||||
|
||||
# For a single model, add it to the registry:
|
||||
ModelRegistry.models[model_name] = model_class
|
||||
|
||||
# For multiple models, you can imitate the import_model_classes() function:
|
||||
from functools import lru_cache
|
||||
|
||||
@lru_cache()
|
||||
def import_new_model_classes():
|
||||
model_arch_name_to_cls = {}
|
||||
# Populate model_arch_name_to_cls with your new model classes.
|
||||
...
|
||||
return model_arch_name_to_cls
|
||||
|
||||
ModelRegistry.models.update(import_new_model_classes())
|
||||
|
||||
# Launch the server with your server arguments:
|
||||
launch_server(server_args)
|
||||
```
|
||||
|
||||
## Example: Implementing and Serving a Llama Wrapper Model
|
||||
|
||||
Below is an introductory, step-by-step walkthrough on how to implement a new model end-to-end in SGLang and then run it via the [Offline Engine](https://github.com/sgl-project/sglang/blob/main/docs/basic_usage/offline_engine_api.ipynb).
|
||||
|
||||
### Implementing Our Model
|
||||
|
||||
To keep things simple, this new model will be a simple wrapper around [Llama 3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct), and our goal will be just to bias the output logits for each `forward` call by taking the square root of each individual logit.
|
||||
|
||||
Let's start by defining our model in a file called `llama_wrapper.py`.
|
||||
The first step is to import the necessary libraries from SRT, which is SGLang's internal backend.
|
||||
|
||||
```python
|
||||
# In the file `llama_wrapper.py`
|
||||
|
||||
import torch
|
||||
from transformers import LlamaConfig
|
||||
from typing import Optional
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
||||
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
|
||||
|
||||
from sglang.srt.models.llama import LlamaForCausalLM
|
||||
```
|
||||
|
||||
Next, we declare a new `class` for our model and have it inherit from `LlamaForCausalLM`, which allows our model to access `LlamaForCausalLM`'s predefined modules and layers, such as `LlamaAttention` and `LlamaMLP`.
|
||||
Note that almost all model implementations take in `config` and `quant_config` as arguments for their `__init__` method; `config` and `quant_config` are passed in via [`model_loader/loader.py`](https://github.com/sgl-project/sglang/blob/bf72b80122fd888bf619d17b96fa3e323ab809fc/python/sglang/srt/model_loader/loader.py#L219).
|
||||
Because we have inherited from `LlamaForCausalLM`, we can pass our parameters directly to its constructor, which will set the member variables for us.
|
||||
|
||||
```python
|
||||
class LlamaWrapper(LlamaForCausalLM):
|
||||
def __init__(
|
||||
self,
|
||||
config: LlamaConfig,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__(config=config, quant_config=quant_config, prefix=prefix)
|
||||
```
|
||||
|
||||
Now, we want to define the `forward` method, which is what will be called at inference time.
|
||||
Note that the signature for `forward` is essentially the same for any model; you can take a look at the other models defined in the [`models` directory](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/) for references.
|
||||
To see where exactly `forward` is called in the SGLang runtime's internals, take a look at [`forward_decode`](https://github.com/sgl-project/sglang/blob/bf72b80122fd888bf619d17b96fa3e323ab809fc/python/sglang/srt/model_executor/model_runner.py#L1705) and [`forward_extend`](https://github.com/sgl-project/sglang/blob/bf72b80122fd888bf619d17b96fa3e323ab809fc/python/sglang/srt/model_executor/model_runner.py#L1724) in the [`ModelRunner` class](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/model_executor/model_runner.py).
|
||||
|
||||
```python
|
||||
@torch.no_grad()
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
forward_batch: ForwardBatch,
|
||||
pp_proxy_tensors: Optional[PPProxyTensors] = None,
|
||||
input_embeds: Optional[torch.Tensor] = None,
|
||||
get_embedding: bool = False,
|
||||
) -> LogitsProcessorOutput:
|
||||
```
|
||||
|
||||
We now call the `__call__` method for `self.model` (which is a member variable that `LlamaForCausalLM` defines in its `__init__` method), which eventually calls `LlamaForCausalLM`'s `forward` method.
|
||||
After that, we feed the `hidden_states` into our model's `LogitsProcessor` (again defined in `LlamaForCausalLM`).
|
||||
|
||||
```python
|
||||
hidden_states = self.model(
|
||||
input_ids,
|
||||
positions,
|
||||
forward_batch,
|
||||
input_embeds,
|
||||
pp_proxy_tensors=pp_proxy_tensors,
|
||||
)
|
||||
|
||||
res: LogitsProcessorOutput = self.logits_processor(
|
||||
input_ids,
|
||||
hidden_states,
|
||||
self.lm_head,
|
||||
forward_batch,
|
||||
)
|
||||
```
|
||||
|
||||
After receiving the logits for the next token, we can finally perform our biasing step.
|
||||
|
||||
```python
|
||||
orig_logits = res.next_token_logits
|
||||
res.next_token_logits = torch.where(
|
||||
orig_logits > 0,
|
||||
orig_logits.sqrt(),
|
||||
orig_logits
|
||||
)
|
||||
|
||||
return res
|
||||
```
|
||||
Now, our `LlamaWrapper` model is created and ready to be served!
|
||||
|
||||
### Serving Our Model Via SGLang's Offline Engine
|
||||
|
||||
The next step of this walkthrough involves hosting our new model offline, so that it can be served locally and without an HTTP server.
|
||||
|
||||
First, create a new file called `run.py`.
|
||||
Now, we must ensure that SGLang's `ModelRegistry` can find our model.
|
||||
To do this, we first download the model's configuration and weights from Huggingface.
|
||||
|
||||
```python
|
||||
# In the file `run.py`
|
||||
|
||||
import asyncio
|
||||
from functools import lru_cache
|
||||
from huggingface_hub import snapshot_download
|
||||
from llama_wrapper import LlamaWrapper # Make sure to import our new model!
|
||||
import sglang as sgl
|
||||
from sglang.srt.models.registry import ModelRegistry
|
||||
|
||||
# Make sure to request access to this model on Huggingface, then export your
|
||||
# `HF_TOKEN` to download the model snapshot
|
||||
llama_dir = snapshot_download(
|
||||
repo_id="meta-llama/Llama-3.1-8B-Instruct",
|
||||
local_dir="./llama_ckpt",
|
||||
)
|
||||
```
|
||||
|
||||
Now that we have our model on disk, we want to point it to `LlamaWrapper` by changing the `architectures` field in `./llama_ckpt/config.json` to be `LlamaWrapper`.
|
||||
That way, when we pass in the path of our model checkpoint to SGLang, it will know that we want to use "LlamaWrapper" instead of "LlamaForCausalLM" as our model.
|
||||
|
||||
```python
|
||||
{
|
||||
"architectures": [
|
||||
# "LlamaForCausalLM"
|
||||
"LlamaWrapper"
|
||||
],
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
However, if we don't link our `LlamaWrapper` class to the "LlamaWrapper" registry keyword, then SGLang won't be able to find our model.
|
||||
Thus, to register our `LlamaWrapper`, we want to follow the steps in the above section titled "Registering an External Model Implementation".
|
||||
|
||||
```python
|
||||
@lru_cache()
|
||||
def import_new_model_classes():
|
||||
model_arch_name_to_cls = {"LlamaWrapper": LlamaWrapper}
|
||||
return model_arch_name_to_cls
|
||||
|
||||
ModelRegistry.models.update(import_new_model_classes())
|
||||
```
|
||||
|
||||
Lastly, when we create our `Engine`, we just pass in the path to the local model directory.
|
||||
Then, our `LlamaWrapper` is ready to be served; for this walkthrough, we will use SGLang `Engine`'s non-streaming asynchronous generation endpoint.
|
||||
|
||||
```python
|
||||
def main():
|
||||
llm = sgl.Engine(model_path="./llama_ckpt")
|
||||
sampling_params = {"temperature": 0.2, "top_k": 5}
|
||||
prompts = [
|
||||
"Write a short, neutral self-introduction for a fictional character. Hello, my name is",
|
||||
"Provide a concise factual statement about France’s capital city. The capital of France is",
|
||||
"Explain possible future trends in artificial intelligence. The future of AI is",
|
||||
]
|
||||
|
||||
asyncio.run(run_llm(llm, sampling_params, prompts))
|
||||
|
||||
llm.shutdown()
|
||||
|
||||
async def run_llm(
|
||||
llm,
|
||||
sampling_params,
|
||||
prompts,
|
||||
) -> None:
|
||||
outputs = await llm.async_generate(prompts, sampling_params)
|
||||
|
||||
for prompt, output in zip(prompts, outputs):
|
||||
print(f"\nPrompt: {prompt}")
|
||||
print(f"Generated text: {output['text']}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
```
|
||||
|
||||
Now, when we call `python run.py`, we will get the outputs of our newly created model!
|
||||
|
||||
|
||||
## Documentation
|
||||
Add to table of supported models in [generative_models.md](https://github.com/sgl-project/sglang/blob/main/docs/supported_models/generative_models.md) or [multimodal_language_models.md](https://github.com/sgl-project/sglang/blob/main/docs/supported_models/multimodal_language_models.md)
|
||||
|
||||
---
|
||||
|
||||
By following these guidelines, you can add support for new language models and multimodal large language models in
|
||||
SGLang and ensure they are thoroughly tested and easily integrated into the system.
|
||||
58
docs/supported_models/transformers_fallback.md
Normal file
58
docs/supported_models/transformers_fallback.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# Transformers fallback in SGLang
|
||||
|
||||
`sglang` can fall back to using models that are available in `transformers`. This works for most decoder-style language models, and support for vision-language models is coming soon!
|
||||
|
||||
## Example launch Command
|
||||
|
||||
By default, we will use the sglang implementation if it is available. Otherwise, we will fall back to the transformers one. However, you can switch the implementation by setting `--model-impl` to `transformers`.
|
||||
|
||||
```shell
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path meta-llama/Llama-3.2-1B-Instruct \
|
||||
--host 0.0.0.0 \
|
||||
--port 30000 \
|
||||
--model-impl transformers
|
||||
```
|
||||
|
||||
## Supported features
|
||||
|
||||
### Quantization
|
||||
|
||||
The Transformers fallback supports most of the quantization methods available in SGLang (except GGUF). See the [Quantization page](../advanced_features/quantization.md) for more information about supported quantization in SGLang.
|
||||
|
||||
### Remote code
|
||||
|
||||
This fallback also means that any model on the hub that can be used in `transformers` with `trust_remote_code=True` that correctly implements attention can be used in production!
|
||||
|
||||
A model just needs the following two things:
|
||||
|
||||
```python
|
||||
from transformers import PreTrainedModel
|
||||
from torch import nn
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
|
||||
def forward(self, hidden_states, **kwargs): # <- kwargs are required
|
||||
|
||||
...
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
attn_output, attn_weights = attention_interface(
|
||||
self,
|
||||
query_states,
|
||||
key_states,
|
||||
value_states,
|
||||
**kwargs,
|
||||
)
|
||||
...
|
||||
|
||||
class MyModel(PreTrainedModel):
|
||||
_supports_attention_backend = True
|
||||
```
|
||||
|
||||
Here is what happens in the background:
|
||||
|
||||
1. The config is loaded
|
||||
2. The `MyModel` Python class is loaded from the `auto_map`, and we check that the model sets `_supports_attention_backend`.
|
||||
3. The `TransformersModel` backend is used. See `/srt/models/transformers`, which leverages `self.config._attn_implementation = "sglang"`, thus the need to use `ALL_ATTENTION_FUNCTIONS`.
|
||||
|
||||
That's it!
|
||||
47
docs/wrap_run_llm.py
Normal file
47
docs/wrap_run_llm.py
Normal file
@@ -0,0 +1,47 @@
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
def insert_runllm_widget(html_content):
    """Return ``html_content`` with the RunLLM chat-widget ``<script>`` tag
    injected immediately before each closing ``</body>`` tag.

    If the document contains no ``</body>`` tag, the content is returned
    unchanged.
    """
    # RunLLM Widget script to be inserted
    widget_script = """
    <!-- RunLLM Widget Script -->
    <script type="module" id="runllm-widget-script" src="https://widget.runllm.com" crossorigin="true" version="stable" runllm-keyboard-shortcut="Mod+j" runllm-name="SGLang Chatbot" runllm-position="BOTTOM_RIGHT" runllm-assistant-id="629" async></script>
    """

    # The tag is a fixed literal, so a plain string replacement is both simpler
    # and safer than re.sub: a regex replacement template would misinterpret
    # any backslash sequence that ever ends up in the widget markup.
    return html_content.replace("</body>", f"{widget_script}\n</body>")


def process_html_files(build_dir):
    """Walk ``build_dir`` and inject the RunLLM widget into every ``.html`` file.

    Files that already contain the widget are skipped, so the script is safe to
    run repeatedly over the same build output (the original version inserted a
    duplicate ``<script>`` tag on every rerun).
    """
    for root, _dirs, files in os.walk(build_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            file_path = os.path.join(root, file)

            # Read the HTML file
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()

            # Idempotency guard: don't re-insert the widget on repeated runs.
            if "runllm-widget-script" in content:
                continue

            # Insert the RunLLM widget
            modified_content = insert_runllm_widget(content)

            # Write back only when something actually changed, to avoid
            # needless mtime churn on pages without a </body> tag.
            if modified_content != content:
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(modified_content)


def main():
    """Locate the Sphinx HTML output directory next to this script and inject
    the RunLLM widget into every generated page."""
    # Get the build directory path
    docs_root = os.path.dirname(os.path.abspath(__file__))
    build_dir = os.path.join(docs_root, "_build", "html")

    # Bail out early when the docs have not been built yet.
    if not os.path.exists(build_dir):
        print(f"Build directory not found: {build_dir}")
        return

    # Process all HTML files
    process_html_files(build_dir)


# Allow running this script directly (e.g. `python wrap_run_llm.py` after a
# Sphinx build) in addition to importing its helpers.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user