docs: init readthedocs support (#783)
This commit is contained in:
@@ -96,7 +96,7 @@ curl http://localhost:30000/generate \
|
|||||||
}
|
}
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
Learn more about the argument format [here](docs/sampling_params.md).
|
Learn more about the argument format [here](docs/en/sampling_params.md).
|
||||||
|
|
||||||
### OpenAI Compatible API
|
### OpenAI Compatible API
|
||||||
In addition, the server supports OpenAI-compatible APIs.
|
In addition, the server supports OpenAI-compatible APIs.
|
||||||
@@ -143,7 +143,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|||||||
```
|
```
|
||||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
|
||||||
```
|
```
|
||||||
- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
|
- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
|
||||||
- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
|
- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
|
||||||
```
|
```
|
||||||
# Node 0
|
# Node 0
|
||||||
@@ -152,7 +152,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|||||||
# Node 1
|
# Node 1
|
||||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
|
||||||
```
|
```
|
||||||
- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
|
- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
|
||||||
- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
|
- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
|
||||||
- To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
|
- To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
|
||||||
|
|
||||||
@@ -195,7 +195,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
|
|||||||
- InternLM 2
|
- InternLM 2
|
||||||
- Mistral NeMo
|
- Mistral NeMo
|
||||||
|
|
||||||
Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
|
Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
|
||||||
|
|
||||||
### Benchmark Performance
|
### Benchmark Performance
|
||||||
|
|
||||||
|
|||||||
17
docs/en/.readthedocs.yaml
Normal file
17
docs/en/.readthedocs.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
version: 2
|
||||||
|
|
||||||
|
formats: all
|
||||||
|
|
||||||
|
build:
|
||||||
|
os: "ubuntu-22.04"
|
||||||
|
tools:
|
||||||
|
python: "3.12"
|
||||||
|
|
||||||
|
|
||||||
|
sphinx:
|
||||||
|
configuration: docs/en/conf.py
|
||||||
|
|
||||||
|
|
||||||
|
python:
|
||||||
|
install:
|
||||||
|
- requirements: docs/requirements.txt
|
||||||
12
docs/en/Makefile
Normal file
12
docs/en/Makefile
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
SPHINXOPTS =
|
||||||
|
SPHINXBUILD = sphinx-build
|
||||||
|
SOURCEDIR = .
|
||||||
|
BUILDDIR = _build
|
||||||
|
|
||||||
|
help:
|
||||||
|
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
|
|
||||||
|
.PHONY: help Makefile
|
||||||
|
|
||||||
|
%: Makefile
|
||||||
|
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
9
docs/en/_static/css/readthedocs.css
Normal file
9
docs/en/_static/css/readthedocs.css
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
table.autosummary td {
|
||||||
|
width: 50%
|
||||||
|
}
|
||||||
|
|
||||||
|
img.align-center {
|
||||||
|
display: block;
|
||||||
|
margin-left: auto;
|
||||||
|
margin-right: auto;
|
||||||
|
}
|
||||||
125
docs/en/conf.py
Normal file
125
docs/en/conf.py
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.abspath("../.."))
|
||||||
|
|
||||||
|
version_file = "../../python/sglang/version.py"
|
||||||
|
with open(version_file, "r") as f:
|
||||||
|
exec(compile(f.read(), version_file, "exec"))
|
||||||
|
__version__ = locals()["__version__"]
|
||||||
|
|
||||||
|
project = "SGLang"
|
||||||
|
copyright = "2023-2024, SGLang"
|
||||||
|
author = "SGLang Team"
|
||||||
|
|
||||||
|
version = __version__
|
||||||
|
release = __version__
|
||||||
|
|
||||||
|
extensions = [
|
||||||
|
"sphinx.ext.autodoc",
|
||||||
|
"sphinx.ext.autosummary",
|
||||||
|
"sphinx.ext.napoleon",
|
||||||
|
"sphinx.ext.viewcode",
|
||||||
|
"sphinx.ext.autosectionlabel",
|
||||||
|
"sphinx.ext.intersphinx",
|
||||||
|
"sphinx_tabs.tabs",
|
||||||
|
"myst_parser",
|
||||||
|
"sphinx_copybutton",
|
||||||
|
"sphinxcontrib.mermaid",
|
||||||
|
]
|
||||||
|
|
||||||
|
autosectionlabel_prefix_document = True
|
||||||
|
|
||||||
|
templates_path = ["_templates"]
|
||||||
|
|
||||||
|
source_suffix = {
|
||||||
|
".rst": "restructuredtext",
|
||||||
|
".md": "markdown",
|
||||||
|
}
|
||||||
|
|
||||||
|
master_doc = "index"
|
||||||
|
|
||||||
|
language = "en"
|
||||||
|
|
||||||
|
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
||||||
|
|
||||||
|
pygments_style = "sphinx"
|
||||||
|
|
||||||
|
html_theme = "sphinx_book_theme"
|
||||||
|
html_logo = "_static/image/logo.png"
|
||||||
|
html_title = project
|
||||||
|
html_copy_source = True
|
||||||
|
html_last_updated_fmt = ""
|
||||||
|
|
||||||
|
html_theme_options = {
|
||||||
|
"path_to_docs": "docs/en",
|
||||||
|
"repository_url": "https://github.com/sgl-project/sglang",
|
||||||
|
"repository_branch": "main",
|
||||||
|
"show_navbar_depth": 3,
|
||||||
|
"max_navbar_depth": 4,
|
||||||
|
"collapse_navbar": True,
|
||||||
|
"use_edit_page_button": True,
|
||||||
|
"use_source_button": True,
|
||||||
|
"use_issues_button": True,
|
||||||
|
"use_repository_button": True,
|
||||||
|
"use_download_button": True,
|
||||||
|
"use_sidenotes": True,
|
||||||
|
"show_toc_level": 2,
|
||||||
|
}
|
||||||
|
|
||||||
|
html_static_path = ["_static"]
|
||||||
|
html_css_files = ["css/readthedocs.css"]
|
||||||
|
|
||||||
|
myst_enable_extensions = [
|
||||||
|
"dollarmath",
|
||||||
|
"amsmath",
|
||||||
|
"deflist",
|
||||||
|
"colon_fence",
|
||||||
|
]
|
||||||
|
myst_heading_anchors = 5
|
||||||
|
|
||||||
|
htmlhelp_basename = "sglangdoc"
|
||||||
|
|
||||||
|
latex_elements = {}
|
||||||
|
|
||||||
|
latex_documents = [
|
||||||
|
(master_doc, "sglang.tex", "sglang Documentation", "SGLang Team", "manual"),
|
||||||
|
]
|
||||||
|
|
||||||
|
man_pages = [(master_doc, "sglang", "sglang Documentation", [author], 1)]
|
||||||
|
|
||||||
|
texinfo_documents = [
|
||||||
|
(
|
||||||
|
master_doc,
|
||||||
|
"sglang",
|
||||||
|
"sglang Documentation",
|
||||||
|
author,
|
||||||
|
"sglang",
|
||||||
|
"One line description of project.",
|
||||||
|
"Miscellaneous",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
epub_title = project
|
||||||
|
|
||||||
|
epub_exclude_files = ["search.html"]
|
||||||
|
|
||||||
|
copybutton_prompt_text = r">>> |\.\.\. "
|
||||||
|
copybutton_prompt_is_regexp = True
|
||||||
|
|
||||||
|
autodoc_preserve_defaults = True
|
||||||
|
navigation_with_keys = False
|
||||||
|
|
||||||
|
autodoc_mock_imports = [
|
||||||
|
"torch",
|
||||||
|
"transformers",
|
||||||
|
"triton",
|
||||||
|
]
|
||||||
|
|
||||||
|
intersphinx_mapping = {
|
||||||
|
"python": ("https://docs.python.org/3.12", None),
|
||||||
|
"typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None),
|
||||||
|
"pillow": ("https://pillow.readthedocs.io/en/stable", None),
|
||||||
|
"numpy": ("https://numpy.org/doc/stable", None),
|
||||||
|
"torch": ("https://pytorch.org/docs/stable", None),
|
||||||
|
}
|
||||||
65
docs/en/index.rst
Normal file
65
docs/en/index.rst
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
Welcome to SGLang's tutorials!
|
||||||
|
====================================
|
||||||
|
|
||||||
|
.. figure:: ./_static/image/logo.png
|
||||||
|
:width: 50%
|
||||||
|
:align: center
|
||||||
|
:alt: SGLang
|
||||||
|
:class: no-scaled-link
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<p style="text-align:center">
|
||||||
|
<strong>SGLang is yet another fast serving framework for large language models and vision language models.
|
||||||
|
</strong>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p style="text-align:center">
|
||||||
|
<script async defer src="https://buttons.github.io/buttons.js"></script>
|
||||||
|
<a class="github-button" href="https://github.com/sgl-project/sglang" data-show-count="true" data-size="large" aria-label="Star">Star</a>
|
||||||
|
<a class="github-button" href="https://github.com/sgl-project/sglang/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
|
||||||
|
<a class="github-button" href="https://github.com/sgl-project/sglang/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
SGLang has the following core features:
|
||||||
|
|
||||||
|
* **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
|
||||||
|
|
||||||
|
* **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
|
||||||
|
|
||||||
|
Documentation
|
||||||
|
-------------
|
||||||
|
|
||||||
|
.. _hyperparameter_tuning:
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: Hyperparameter Tuning
|
||||||
|
|
||||||
|
hyperparameter_tuning.md
|
||||||
|
|
||||||
|
.. _custom_chat_template:
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: Custom Chat Template
|
||||||
|
|
||||||
|
custom_chat_template.md
|
||||||
|
|
||||||
|
.. _model_support:
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: Model Support
|
||||||
|
|
||||||
|
model_support.md
|
||||||
|
|
||||||
|
.. _sampling_params:
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: Sampling Params
|
||||||
|
|
||||||
|
sampling_params.md
|
||||||
|
|
||||||
|
Indices and tables
|
||||||
|
==================
|
||||||
|
|
||||||
|
* :ref:`genindex`
|
||||||
|
* :ref:`search`
|
||||||
12
docs/requirements.txt
Normal file
12
docs/requirements.txt
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
markdown>=3.4.0
|
||||||
|
myst-parser
|
||||||
|
sphinx
|
||||||
|
sphinx-book-theme
|
||||||
|
sphinx-copybutton
|
||||||
|
sphinx-tabs
|
||||||
|
sphinxcontrib-mermaid
|
||||||
|
pillow
|
||||||
|
pydantic
|
||||||
|
torch
|
||||||
|
transformers
|
||||||
|
urllib3<2.0.0
|
||||||
@@ -75,7 +75,7 @@ def gen(
|
|||||||
choices: Optional[List[str]] = None,
|
choices: Optional[List[str]] = None,
|
||||||
regex: Optional[str] = None,
|
regex: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
|
"""Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
|
||||||
|
|
||||||
if choices:
|
if choices:
|
||||||
return SglSelect(name, choices, 0.0 if temperature is None else temperature)
|
return SglSelect(name, choices, 0.0 if temperature is None else temperature)
|
||||||
|
|||||||
@@ -410,7 +410,7 @@ class SglGen(SglExpr):
|
|||||||
dtype: Optional[type] = None,
|
dtype: Optional[type] = None,
|
||||||
regex: Optional[str] = None,
|
regex: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
|
"""Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.name = name
|
self.name = name
|
||||||
self.sampling_params = SglSamplingParams(
|
self.sampling_params = SglSamplingParams(
|
||||||
|
|||||||
Reference in New Issue
Block a user