diff --git a/README.md b/README.md index 4715852fe..477392350 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ curl http://localhost:30000/generate \ } }' ``` -Learn more about the argument format [here](docs/sampling_params.md). +Learn more about the argument format [here](docs/en/sampling_params.md). ### OpenAI Compatible API In addition, the server supports OpenAI-compatible APIs. @@ -143,7 +143,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct ``` python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7 ``` -- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance. +- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance. - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port. ``` # Node 0 @@ -152,7 +152,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct # Node 1 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1 ``` -- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md). +- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md). - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments. - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes. @@ -195,7 +195,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/ - InternLM 2 - Mistral NeMo -Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md). +Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md). ### Benchmark Performance diff --git a/docs/en/.readthedocs.yaml b/docs/en/.readthedocs.yaml new file mode 100644 index 000000000..94f52e9a0 --- /dev/null +++ b/docs/en/.readthedocs.yaml @@ -0,0 +1,17 @@ +version: 2 + +formats: all + +build: + os: "ubuntu-22.04" + tools: + python: "3.12" + + +sphinx: + configuration: docs/en/conf.py + + +python: + install: + - requirements: docs/requirements.txt diff --git a/docs/en/Makefile b/docs/en/Makefile new file mode 100644 index 000000000..9ad4b38e0 --- /dev/null +++ b/docs/en/Makefile @@ -0,0 +1,12 @@ +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/en/_static/css/readthedocs.css b/docs/en/_static/css/readthedocs.css new file mode 100644 index 000000000..aca6649b4 --- /dev/null +++ b/docs/en/_static/css/readthedocs.css @@ -0,0 +1,9 @@ +table.autosummary td { + width: 50% +} + +img.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} diff --git a/docs/benchmark_results.md b/docs/en/benchmark_results.md similarity index 100% rename from docs/benchmark_results.md rename to docs/en/benchmark_results.md diff --git a/docs/en/conf.py b/docs/en/conf.py new file mode 100644 index 000000000..5a7ed2dbf --- /dev/null +++ b/docs/en/conf.py @@ -0,0 +1,125 @@ +import os +import sys + +sys.path.insert(0, os.path.abspath("../..")) + +version_file = "../../python/sglang/version.py" +with open(version_file, "r") as f: + exec(compile(f.read(), version_file, "exec")) +__version__ = locals()["__version__"] + +project = "SGLang" +copyright = "2023-2024, SGLang" +author = "SGLang Team" + +version = __version__ +release = __version__ + +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "sphinx.ext.autosectionlabel", + "sphinx.ext.intersphinx", + "sphinx_tabs.tabs", + "myst_parser", + "sphinx_copybutton", + "sphinxcontrib.mermaid", +] + +autosectionlabel_prefix_document = True + +templates_path = ["_templates"] + +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + +master_doc = "index" + +language = "en" + +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +pygments_style = "sphinx" + +html_theme = "sphinx_book_theme" +html_logo = "_static/image/logo.png" +html_title = project +html_copy_source = True +html_last_updated_fmt = "" + +html_theme_options = { + "path_to_docs": "docs/en", + "repository_url": "https://github.com/sgl-project/sglang", + "repository_branch": "main", + "show_navbar_depth": 3, + "max_navbar_depth": 4, + "collapse_navbar": True, + "use_edit_page_button": True, + "use_source_button": True, + "use_issues_button": True, + "use_repository_button": True, + "use_download_button": True, + "use_sidenotes": True, + "show_toc_level": 2, +} + +html_static_path = ["_static"] +html_css_files = ["css/readthedocs.css"] + +myst_enable_extensions = [ + "dollarmath", + "amsmath", + "deflist", + "colon_fence", +] +myst_heading_anchors = 5 + +htmlhelp_basename = "sglangdoc" + +latex_elements = {} + +latex_documents = [ + (master_doc, "sglang.tex", "sglang Documentation", "SGLang Team", "manual"), +] + +man_pages = [(master_doc, "sglang", "sglang Documentation", [author], 1)] + +texinfo_documents = [ + ( + master_doc, + "sglang", + "sglang Documentation", + author, + "sglang", + "One line description of project.", + "Miscellaneous", + ), +] + +epub_title = project + +epub_exclude_files = ["search.html"] + +copybutton_prompt_text = r">>> |\.\.\. " +copybutton_prompt_is_regexp = True + +autodoc_preserve_defaults = True +navigation_with_keys = False + +autodoc_mock_imports = [ + "torch", + "transformers", + "triton", +] + +intersphinx_mapping = { + "python": ("https://docs.python.org/3.12", None), + "typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None), + "pillow": ("https://pillow.readthedocs.io/en/stable", None), + "numpy": ("https://numpy.org/doc/stable", None), + "torch": ("https://pytorch.org/docs/stable", None), +} diff --git a/docs/custom_chat_template.md b/docs/en/custom_chat_template.md similarity index 100% rename from docs/custom_chat_template.md rename to docs/en/custom_chat_template.md diff --git a/docs/hyperparameter_tuning.md b/docs/en/hyperparameter_tuning.md similarity index 100% rename from docs/hyperparameter_tuning.md rename to docs/en/hyperparameter_tuning.md diff --git a/docs/en/index.rst b/docs/en/index.rst new file mode 100644 index 000000000..4621b838b --- /dev/null +++ b/docs/en/index.rst @@ -0,0 +1,65 @@ +Welcome to SGLang's tutorials! +==================================== + +.. figure:: ./_static/image/logo.png + :width: 50% + :align: center + :alt: SGLang + :class: no-scaled-link + +.. raw:: html + +
+ SGLang is yet another fast serving framework for large language models and vision language models. + +
+ + + +SGLang has the following core features: + +* **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin). + +* **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions. + +Documentation +------------- + +.. _hyperparameter_tuning: +.. toctree:: + :maxdepth: 1 + :caption: Hyperparameter Tuning + + hyperparameter_tuning.md + +.. _custom_chat_template: +.. toctree:: + :maxdepth: 1 + :caption: Custom Chat Template + + custom_chat_template.md + +.. _model_support: +.. toctree:: + :maxdepth: 1 + :caption: Model Support + + model_support.md + +.. _sampling_params: +.. toctree:: + :maxdepth: 1 + :caption: Sampling Params + + sampling_params.md + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/model_support.md b/docs/en/model_support.md similarity index 100% rename from docs/model_support.md rename to docs/en/model_support.md diff --git a/docs/release_process.md b/docs/en/release_process.md similarity index 100% rename from docs/release_process.md rename to docs/en/release_process.md diff --git a/docs/sampling_params.md b/docs/en/sampling_params.md similarity index 100% rename from docs/sampling_params.md rename to docs/en/sampling_params.md diff --git a/docs/test_process.md b/docs/en/test_process.md similarity index 100% rename from docs/test_process.md rename to docs/en/test_process.md diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..826a34bc1 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,12 @@ +markdown>=3.4.0 +myst-parser +sphinx +sphinx-book-theme +sphinx-copybutton +sphinx-tabs +sphinxcontrib-mermaid +pillow +pydantic +torch +transformers +urllib3<2.0.0 diff --git a/python/sglang/api.py b/python/sglang/api.py index c32943963..70f992b14 100644 --- a/python/sglang/api.py +++ b/python/sglang/api.py @@ -75,7 +75,7 @@ def gen( choices: Optional[List[str]] = None, regex: Optional[str] = None, ): - """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md""" + """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md""" if choices: return SglSelect(name, choices, 0.0 if temperature is None else temperature) diff --git a/python/sglang/lang/ir.py b/python/sglang/lang/ir.py index e5d5e837a..23537f350 100644 --- a/python/sglang/lang/ir.py +++ b/python/sglang/lang/ir.py @@ -410,7 +410,7 @@ class SglGen(SglExpr): dtype: Optional[type] = None, regex: Optional[str] = None, ): - """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md""" + """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md""" super().__init__() self.name = name self.sampling_params = SglSamplingParams(