docs: init readthedocs support (#783)

This commit is contained in:
Yineng Zhang
2024-07-28 16:50:31 +10:00
committed by GitHub
parent 68e5262699
commit 948625799e
16 changed files with 246 additions and 6 deletions

docs/en/.readthedocs.yaml Normal file

@@ -0,0 +1,17 @@
version: 2

formats: all

build:
  os: "ubuntu-22.04"
  tools:
    python: "3.12"

sphinx:
  configuration: docs/en/conf.py

python:
  install:
    - requirements: docs/requirements.txt

docs/en/Makefile Normal file

@@ -0,0 +1,12 @@
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = .
BUILDDIR = _build
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


@@ -0,0 +1,9 @@
table.autosummary td {
  width: 50%;
}

img.align-center {
  display: block;
  margin-left: auto;
  margin-right: auto;
}


@@ -0,0 +1,22 @@
# Benchmark Results
We tested our system on the following common LLM workloads and reported the achieved throughput:
- **[MMLU](https://arxiv.org/abs/2009.03300)**: A 5-shot, multi-choice, multi-task benchmark.
- **[HellaSwag](https://arxiv.org/abs/1905.07830)**: A 20-shot, multi-choice sentence completion benchmark.
- **[ReAct Agent](https://arxiv.org/abs/2210.03629)**: An agent task using prompt traces collected from the original ReAct paper.
- **[Tree-of-Thought](https://arxiv.org/pdf/2305.10601.pdf)**: A custom tree search-based prompt for solving GSM-8K problems.
- **JSON Decode**: Extracting information from a Wikipedia page and outputting it in JSON format.
- **Chat (short)**: A synthetic chat benchmark where each conversation includes 4 turns with short LLM outputs.
- **Chat (long)**: A synthetic chat benchmark where each conversation includes 4 turns with long LLM outputs.
- **[DSPy RAG](https://github.com/stanfordnlp/dspy)**: A retrieval-augmented generation pipeline in the DSPy tutorial.
- **[LLaVA Bench](https://github.com/haotian-liu/LLaVA)**: Running LLaVA v1.5, a vision language model, on the LLaVA-in-the-wild benchmark.

We tested both Llama-7B on one NVIDIA A10G GPU (24GB) and Mixtral-8x7B on 8 NVIDIA A10G GPUs with tensor parallelism, using FP16 precision. We used vLLM v0.2.5, guidance v0.1.8, Hugging Face TGI v1.3.0, and SGLang v0.1.5.
- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
![llama_7b](../assets/llama_7b.jpg)
- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
![mixtral_8x7b](../assets/mixtral_8x7b.jpg)
The benchmark code is available [here](https://github.com/sgl-project/sglang/tree/main/benchmark).

docs/en/conf.py Normal file

@@ -0,0 +1,125 @@
import os
import sys
sys.path.insert(0, os.path.abspath("../.."))
version_file = "../../python/sglang/version.py"
with open(version_file, "r") as f:
    exec(compile(f.read(), version_file, "exec"))
__version__ = locals()["__version__"]
project = "SGLang"
copyright = "2023-2024, SGLang"
author = "SGLang Team"
version = __version__
release = __version__
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.napoleon",
"sphinx.ext.viewcode",
"sphinx.ext.autosectionlabel",
"sphinx.ext.intersphinx",
"sphinx_tabs.tabs",
"myst_parser",
"sphinx_copybutton",
"sphinxcontrib.mermaid",
]
autosectionlabel_prefix_document = True
templates_path = ["_templates"]
source_suffix = {
".rst": "restructuredtext",
".md": "markdown",
}
master_doc = "index"
language = "en"
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
pygments_style = "sphinx"
html_theme = "sphinx_book_theme"
html_logo = "_static/image/logo.png"
html_title = project
html_copy_source = True
html_last_updated_fmt = ""
html_theme_options = {
"path_to_docs": "docs/en",
"repository_url": "https://github.com/sgl-project/sglang",
"repository_branch": "main",
"show_navbar_depth": 3,
"max_navbar_depth": 4,
"collapse_navbar": True,
"use_edit_page_button": True,
"use_source_button": True,
"use_issues_button": True,
"use_repository_button": True,
"use_download_button": True,
"use_sidenotes": True,
"show_toc_level": 2,
}
html_static_path = ["_static"]
html_css_files = ["css/readthedocs.css"]
myst_enable_extensions = [
"dollarmath",
"amsmath",
"deflist",
"colon_fence",
]
myst_heading_anchors = 5
htmlhelp_basename = "sglangdoc"
latex_elements = {}
latex_documents = [
(master_doc, "sglang.tex", "sglang Documentation", "SGLang Team", "manual"),
]
man_pages = [(master_doc, "sglang", "sglang Documentation", [author], 1)]
texinfo_documents = [
(
master_doc,
"sglang",
"sglang Documentation",
author,
"sglang",
"One line description of project.",
"Miscellaneous",
),
]
epub_title = project
epub_exclude_files = ["search.html"]
copybutton_prompt_text = r">>> |\.\.\. "
copybutton_prompt_is_regexp = True
autodoc_preserve_defaults = True
navigation_with_keys = False
autodoc_mock_imports = [
"torch",
"transformers",
"triton",
]
intersphinx_mapping = {
"python": ("https://docs.python.org/3.12", None),
"typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None),
"pillow": ("https://pillow.readthedocs.io/en/stable", None),
"numpy": ("https://numpy.org/doc/stable", None),
"torch": ("https://pytorch.org/docs/stable", None),
}


@@ -0,0 +1,28 @@
# Custom Chat Template in SGLang Runtime
By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
If needed, you can also override the chat template when launching the server:
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
```
If the chat template you are looking for is missing, you are welcome to contribute it.
Meanwhile, you can also define your own chat template in a JSON file and use it temporarily, as follows:
```json
{
"name": "my_model",
"system": "<|im_start|>system",
"user": "<|im_start|>user",
"assistant": "<|im_start|>assistant",
"sep_style": "CHATML",
"sep": "<|im_end|>",
"stop_str": ["<|im_end|>", "<|im_start|>"]
}
```
Save the template above as a JSON file (for example `my_model_template.json`) and pass its path when launching the server:
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.json
```
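To confirm that the template is being applied, one quick check is to send a chat request through the OpenAI-compatible endpoint and inspect the reply. This is only a sketch under the assumption that the server above is running on port 30000; the request and response shapes follow the standard OpenAI chat completions API and are not specific to any particular template:
```python
import requests

# Send a chat request; the server formats the messages with the active chat
# template (built-in, named via --chat-template, or the custom JSON file above).
response = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "default",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say hello in one short sentence."},
        ],
        "temperature": 0,
        "max_tokens": 32,
    },
)
print(response.json()["choices"][0]["message"]["content"])
```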


@@ -0,0 +1,35 @@
# Guide on Hyperparameter Tuning
## Achieving Peak Throughput
Running with a large batch size is the most important factor for achieving high throughput.
When the server is running at full load, look for the following in the log:
```
[gpu_id=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417
```
### Tune Your Request Submission Speed
`#queue-req` indicates the number of requests in the queue. If you frequently see `#queue-req == 0`, it suggests you are bottlenecked by the request submission speed.
A healthy range for `#queue-req` is `100 - 3000`.
### Tune `--schedule-conservativeness`
`token usage` indicates the KV cache memory utilization of the server. `token usage > 0.9` means good utilization.
If you frequently see `token usage < 0.9` and `#queue-req > 0`, it means the server is too conservative about taking in new requests. You can decrease `--schedule-conservativeness` to a value like 0.3.
The server can end up being too conservative when users send many requests with a large `max_new_tokens` but the requests stop very early due to EOS or stop strings.
On the other hand, if `token usage` is very high and you frequently see warnings like
`decode out of memory happened, #retracted_reqs: 1, #new_token_ratio: 0.9998 -> 1.0000`, you can increase `--schedule-conservativeness` to a value like 1.3.
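If you want to watch these numbers programmatically while tuning, a small helper like the one below can pull `token usage` and `#queue-req` out of the decode log line shown above. This is only a sketch; the exact log format may change between versions, and the thresholds simply restate the guidance in this section:
```python
import re

# Example decode log line emitted by the server (see the top of this guide).
line = (
    "[gpu_id=0] Decode batch. #running-req: 233, #token: 370959, "
    "token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417"
)

match = re.search(r"token usage: ([0-9.]+).*#queue-req: (\d+)", line)
if match:
    token_usage = float(match.group(1))
    queue_req = int(match.group(2))
    if token_usage < 0.9 and queue_req > 0:
        print("Likely too conservative: consider lowering --schedule-conservativeness.")
    elif token_usage > 0.95:
        print("KV cache nearly full: watch for retraction warnings and consider raising --schedule-conservativeness.")
```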
### Tune `--dp-size` and `--tp-size`
Data parallelism is better for throughput. When there is enough GPU memory, always favor data parallelism over tensor parallelism.
### (Minor) Tune `--max-prefill-tokens`, `--mem-fraction-static`, `--max-running-requests`
If you see out of memory (OOM) errors, you can decrease these parameters.
If OOM happens during prefill, try to decrease `--max-prefill-tokens`.
If OOM happens during decoding, try to decrease `--max-running-requests`.
You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding.
### (Minor) Tune `--schedule-heuristic`
If you have many shared prefixes, use the default `--schedule-heuristic lpm` (`lpm` stands for longest prefix match).
If you have no shared prefixes at all, or you always send requests with shared prefixes together,
you can try `--schedule-heuristic fcfs` (`fcfs` stands for first come, first served).

docs/en/index.rst Normal file

@@ -0,0 +1,65 @@
Welcome to SGLang's tutorials!
====================================
.. figure:: ./_static/image/logo.png
  :width: 50%
  :align: center
  :alt: SGLang
  :class: no-scaled-link

.. raw:: html

   <p style="text-align:center">
   <strong>SGLang is yet another fast serving framework for large language models and vision language models.
   </strong>
   </p>

   <p style="text-align:center">
   <script async defer src="https://buttons.github.io/buttons.js"></script>
   <a class="github-button" href="https://github.com/sgl-project/sglang" data-show-count="true" data-size="large" aria-label="Star">Star</a>
   <a class="github-button" href="https://github.com/sgl-project/sglang/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
   <a class="github-button" href="https://github.com/sgl-project/sglang/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
   </p>
SGLang has the following core features:
* **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
* **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
Documentation
-------------
.. _hyperparameter_tuning:
.. toctree::
   :maxdepth: 1
   :caption: Hyperparameter Tuning

   hyperparameter_tuning.md

.. _custom_chat_template:
.. toctree::
   :maxdepth: 1
   :caption: Custom Chat Template

   custom_chat_template.md

.. _model_support:
.. toctree::
   :maxdepth: 1
   :caption: Model Support

   model_support.md

.. _sampling_params:
.. toctree::
   :maxdepth: 1
   :caption: Sampling Params

   sampling_params.md
Indices and tables
==================
* :ref:`genindex`
* :ref:`search`

docs/en/model_support.md Normal file

@@ -0,0 +1,16 @@
# How to Support a New Model
To support a new model in SGLang, you only need to add a single file under [SGLang Models Directory](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/models). You can learn from existing model implementations and create new files for the new models. Most models are based on the transformer architecture, making them very similar.
Another valuable resource is the [vLLM Models Directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models). vLLM has extensive coverage of models, and SGLang has reused vLLM for most parts of the model implementations. This similarity makes it easy to port many models from vLLM to SGLang.
To port a model from vLLM to SGLang, you can compare these two files: the [SGLang LLaMA Implementation](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/llama2.py) and the [vLLM LLaMA Implementation](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py). This comparison will help you understand how to convert a model implementation from vLLM to SGLang. The major difference is the replacement of PagedAttention with RadixAttention; the other parts are almost identical. Specifically (a rough skeleton is sketched after the list below):
- Replace vLLM's `Attention` with `RadixAttention`.
- Replace vLLM's `LogitsProcessor` with SGLang's `LogitsProcessor`.
- Remove `Sample`.
- Change `forward()` functions, and add `input_metadata`.
- Add `EntryClass` at the end.
- Test correctness by comparing the final logits and outputs of the following two commands:
- `python3 playground/reference_hf.py --model [new model]`
- `python3 -m sglang.bench_latency --model [new model] --correct --output-len 16 --trust-remote-code`
- Update [Supported Models](https://github.com/sgl-project/sglang/tree/main?tab=readme-ov-file#supported-models) in the [README](../README.md).
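As a rough illustration of what the steps above produce, a new model file typically ends up with the following shape. This is only a sketch; the class name, constructor arguments, and layer wiring are placeholders, and the real signatures should be copied from an existing file such as `llama2.py` in the models directory:
```python
# Illustrative skeleton only; copy real signatures from an existing model file.
from torch import nn


class MyModelForCausalLM(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Build embeddings and decoder layers here, using SGLang's RadixAttention
        # in place of vLLM's Attention and SGLang's LogitsProcessor in place of
        # vLLM's. The Sample step from the vLLM implementation is removed.
        self.config = config

    def forward(self, input_ids, positions, input_metadata):
        # Compared with the vLLM version, the extra `input_metadata` argument is
        # threaded through the attention layers and the logits processor.
        raise NotImplementedError("Fill in the decoder forward pass.")


# SGLang discovers the model through this variable at the end of the file.
EntryClass = MyModelForCausalLM
```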


@@ -0,0 +1,18 @@
# PyPI Package Release Process
## Update the version in code
Update the package version in `python/pyproject.toml` and `python/sglang/__init__.py`.
## Upload the PyPI package
```
pip install build twine
```
```
cd python
bash upload_pypi.sh
```
## Make a release in GitHub
Make a new release at https://github.com/sgl-project/sglang/releases/new.

docs/en/sampling_params.md Normal file

@@ -0,0 +1,144 @@
# Sampling Parameters in SGLang Runtime
This doc describes the sampling parameters of the SGLang Runtime.
The `/generate` endpoint accepts the following arguments in JSON format.
```python
@dataclass
class GenerateReqInput:
    # The input prompt. It can be a single prompt or a batch of prompts.
    text: Optional[Union[List[str], str]] = None
    # The token ids for text; one can either specify text or input_ids.
    input_ids: Optional[Union[List[List[int]], List[int]]] = None
    # The image input. It can be a file name, a url, or base64 encoded string.
    # See also python/sglang/srt/utils.py:load_image.
    image_data: Optional[Union[List[str], str]] = None
    # The sampling_params. See descriptions below.
    sampling_params: Union[List[Dict], Dict] = None
    # The request id.
    rid: Optional[Union[List[str], str]] = None
    # Whether to return logprobs.
    return_logprob: Optional[Union[List[bool], bool]] = None
    # The start location of the prompt for return_logprob.
    logprob_start_len: Optional[Union[List[int], int]] = None
    # The number of top logprobs to return.
    top_logprobs_num: Optional[Union[List[int], int]] = None
    # Whether to detokenize tokens in text in the returned logprobs.
    return_text_in_logprobs: bool = False
    # Whether to stream output.
    stream: bool = False
```
The `sampling_params` follows this format:
```python
# The maximum number of output tokens
max_new_tokens: int = 16,
# Stop when hitting any of the strings in this list.
stop: Optional[Union[str, List[str]]] = None,
# Sampling temperature
temperature: float = 1.0,
# Top-p sampling
top_p: float = 1.0,
# Top-k sampling
top_k: int = -1,
# Whether to ignore EOS token.
ignore_eos: bool = False,
# Whether to skip the special tokens during detokenization.
skip_special_tokens: bool = True,
# Whether to add spaces between special tokens during detokenization.
spaces_between_special_tokens: bool = True,
# Constrains the output to follow a given regular expression.
regex: Optional[str] = None,
# Do parallel sampling and return `n` outputs.
n: int = 1,
```
## Examples
### Normal
Launch a server
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
```
Send a request
```python
import requests
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 32,
        },
    },
)
print(response.json())
```
### Streaming
Send a request and stream the output
```python
import requests, json
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 256,
        },
        "stream": True,
    },
    stream=True,
)

prev = 0
for chunk in response.iter_lines(decode_unicode=False):
    chunk = chunk.decode("utf-8")
    if chunk and chunk.startswith("data:"):
        if chunk == "data: [DONE]":
            break
        data = json.loads(chunk[5:].strip("\n"))
        output = data["text"].strip()
        print(output[prev:], end="", flush=True)
        prev = len(output)
print("")
```
### Multi-modal
Launch a server
```
python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000
```
Download an image
```
curl -o example_image.png -L https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true
```
Send a request
```python
import requests
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nDescribe this picture ASSISTANT:",
        "image_data": "example_image.png",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 32,
        },
    },
)
print(response.json())
```
The `image_data` can be a file name, a URL, or a base64 encoded string. See also `python/sglang/srt/utils.py:load_image`.
Streaming is supported in a similar manner as [above](#streaming).
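### Batch

Since `text` and `sampling_params` also accept lists (see `GenerateReqInput` above), a batch of prompts can be sent in a single request. A minimal sketch; the response is expected to be a list with one entry per prompt, although the exact output format may vary between versions:
```python
import requests

# Batch request: `text` is a list of prompts; the same sampling_params apply to all of them.
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": [
            "The capital of France is",
            "The capital of Japan is",
        ],
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 8,
        },
    },
)
for item in response.json():
    print(item["text"])
```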

docs/en/test_process.md Normal file

@@ -0,0 +1,100 @@
# SRT Unit Tests
### Latency Alignment
Make sure your changes do not slow down the following benchmarks:
```
# single gpu
python -m sglang.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 32 --input-len 512 --output-len 256
python -m sglang.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 1 --input-len 512 --output-len 256
# multiple gpu
python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 32 --input-len 8192 --output-len 1
python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 1 --input-len 8100 --output-len 32
# moe model
python -m sglang.bench_latency --model-path databricks/dbrx-base --tp 8 --mem-fraction-static 0.6 --batch 4 --input-len 1024 --output-len 32
```
### High-level API
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
cd test/lang
python3 test_srt_backend.py
```
### Performance
#### MMLU
```
cd benchmark/mmlu
```
Follow README.md to download the data.
```
python3 bench_sglang.py --nsub 3
# Expected performance on A10G
# Total latency: 8.200
# Average accuracy: 0.413
```
#### GSM-8K
```
cd benchmark/gsm8k
```
Follow README.md to download the data.
```
python3 bench_sglang.py --num-q 200
# Expected performance on A10G
# Latency: 32.103
# Accuracy: 0.250
```
#### More
Please also test `benchmark/hellaswag` and `benchmark/latency_throughput`.
### More Models
#### LLaVA
```
python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000
```
```
cd benchmark/llava_bench
python3 bench_sglang.py
# Expected performance on A10G
# Latency: 50.031
```
## SGLang Unit Tests
```
export ANTHROPIC_API_KEY=
export OPENAI_API_KEY=
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
cd test/lang
python3 run_all.py
```
## OpenAI API server
```
cd test/srt
python test_openai_server.py
```
## Format
```
pip3 install pre-commit
cd sglang
pre-commit install
pre-commit run --all-files
```