docs: init readthedocs support (#783)

This commit is contained in:
Yineng Zhang
2024-07-28 16:50:31 +10:00
committed by GitHub
parent 68e5262699
commit 948625799e
16 changed files with 246 additions and 6 deletions

docs/en/.readthedocs.yaml Normal file

@@ -0,0 +1,17 @@
version: 2

formats: all

build:
  os: "ubuntu-22.04"
  tools:
    python: "3.12"

sphinx:
  configuration: docs/en/conf.py

python:
  install:
    - requirements: docs/requirements.txt

docs/en/Makefile Normal file

@@ -0,0 +1,12 @@
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = .
BUILDDIR = _build
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


@@ -0,0 +1,9 @@
table.autosummary td {
  width: 50%;
}

img.align-center {
  display: block;
  margin-left: auto;
  margin-right: auto;
}


@@ -0,0 +1,22 @@
# Benchmark Results
We tested our system on the following common LLM workloads and reported the achieved throughput:
- **[MMLU](https://arxiv.org/abs/2009.03300)**: A 5-shot, multi-choice, multi-task benchmark.
- **[HellaSwag](https://arxiv.org/abs/1905.07830)**: A 20-shot, multi-choice sentence completion benchmark.
- **[ReAct Agent](https://arxiv.org/abs/2210.03629)**: An agent task using prompt traces collected from the original ReAct paper.
- **[Tree-of-Thought](https://arxiv.org/pdf/2305.10601.pdf)**: A custom tree search-based prompt for solving GSM-8K problems.
- **JSON Decode**: Extracting information from a Wikipedia page and outputting it in JSON format.
- **Chat (short)**: A synthetic chat benchmark where each conversation includes 4 turns with short LLM outputs.
- **Chat (long)**: A synthetic chat benchmark where each conversation includes 4 turns with long LLM outputs.
- **[DSPy RAG](https://github.com/stanfordnlp/dspy)**: A retrieval-augmented generation pipeline in the DSPy tutorial.
- **[LLaVA Bench](https://github.com/haotian-liu/LLaVA)**: Running LLaVA v1.5, a vision language model, on the LLaVA-in-the-wild benchmark.

We tested both Llama-7B on one NVIDIA A10G GPU (24GB) and Mixtral-8x7B on 8 NVIDIA A10G GPUs with tensor parallelism, using FP16 precision. We used vLLM v0.2.5, guidance v0.1.8, Hugging Face TGI v1.3.0, and SGLang v0.1.5.
- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
![llama_7b](../assets/llama_7b.jpg)
- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
![mixtral_8x7b](../assets/mixtral_8x7b.jpg)
The benchmark code is available [here](https://github.com/sgl-project/sglang/tree/main/benchmark).

docs/en/conf.py Normal file

@@ -0,0 +1,125 @@
import os
import sys
sys.path.insert(0, os.path.abspath("../.."))
version_file = "../../python/sglang/version.py"
with open(version_file, "r") as f:
    exec(compile(f.read(), version_file, "exec"))
__version__ = locals()["__version__"]
project = "SGLang"
copyright = "2023-2024, SGLang"
author = "SGLang Team"
version = __version__
release = __version__
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.napoleon",
"sphinx.ext.viewcode",
"sphinx.ext.autosectionlabel",
"sphinx.ext.intersphinx",
"sphinx_tabs.tabs",
"myst_parser",
"sphinx_copybutton",
"sphinxcontrib.mermaid",
]
autosectionlabel_prefix_document = True
templates_path = ["_templates"]
source_suffix = {
".rst": "restructuredtext",
".md": "markdown",
}
master_doc = "index"
language = "en"
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
pygments_style = "sphinx"
html_theme = "sphinx_book_theme"
html_logo = "_static/image/logo.png"
html_title = project
html_copy_source = True
html_last_updated_fmt = ""
html_theme_options = {
"path_to_docs": "docs/en",
"repository_url": "https://github.com/sgl-project/sglang",
"repository_branch": "main",
"show_navbar_depth": 3,
"max_navbar_depth": 4,
"collapse_navbar": True,
"use_edit_page_button": True,
"use_source_button": True,
"use_issues_button": True,
"use_repository_button": True,
"use_download_button": True,
"use_sidenotes": True,
"show_toc_level": 2,
}
html_static_path = ["_static"]
html_css_files = ["css/readthedocs.css"]
myst_enable_extensions = [
"dollarmath",
"amsmath",
"deflist",
"colon_fence",
]
myst_heading_anchors = 5
htmlhelp_basename = "sglangdoc"
latex_elements = {}
latex_documents = [
(master_doc, "sglang.tex", "sglang Documentation", "SGLang Team", "manual"),
]
man_pages = [(master_doc, "sglang", "sglang Documentation", [author], 1)]
texinfo_documents = [
(
master_doc,
"sglang",
"sglang Documentation",
author,
"sglang",
"One line description of project.",
"Miscellaneous",
),
]
epub_title = project
epub_exclude_files = ["search.html"]
copybutton_prompt_text = r">>> |\.\.\. "
copybutton_prompt_is_regexp = True
autodoc_preserve_defaults = True
navigation_with_keys = False
autodoc_mock_imports = [
"torch",
"transformers",
"triton",
]
intersphinx_mapping = {
"python": ("https://docs.python.org/3.12", None),
"typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None),
"pillow": ("https://pillow.readthedocs.io/en/stable", None),
"numpy": ("https://numpy.org/doc/stable", None),
"torch": ("https://pytorch.org/docs/stable", None),
}


@@ -0,0 +1,28 @@
# Custom Chat Template in SGLang Runtime
By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
If needed, you can also override the chat template when launching the server:
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
```
If the chat template you are looking for is missing, you are welcome to contribute it.
Meanwhile, you can also define your own chat template in a JSON file and use it temporarily, as follows:
```json
{
"name": "my_model",
"system": "<|im_start|>system",
"user": "<|im_start|>user",
"assistant": "<|im_start|>assistant",
"sep_style": "CHATML",
"sep": "<|im_end|>",
"stop_str": ["<|im_end|>", "<|im_start|>"]
}
```
Save the template above as a JSON file (for example `my_model_template.json`) and pass its path when launching the server:
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.json
```
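To confirm that the template is being applied, one quick check is to send a chat request through the OpenAI-compatible endpoint and inspect the reply. This is only a sketch under the assumption that the server above is running on port 30000; the request and response shapes follow the standard OpenAI chat completions API and are not specific to any particular template:
```python
import requests

# Send a chat request; the server formats the messages with the active chat
# template (built-in, named via --chat-template, or the custom JSON file above).
response = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "default",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say hello in one short sentence."},
        ],
        "temperature": 0,
        "max_tokens": 32,
    },
)
print(response.json()["choices"][0]["message"]["content"])
```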


@@ -0,0 +1,35 @@
# Guide on Hyperparameter Tuning
## Achieving Peak Throughput
Running with a large batch size is the most important factor for achieving high throughput.
When the server is running at full load, look for the following in the log:
```
[gpu_id=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417
```
### Tune Your Request Submission Speed
`#queue-req` indicates the number of requests in the queue. If you frequently see `#queue-req == 0`, it suggests you are bottlenecked by the request submission speed.
A healthy range for `#queue-req` is `100 - 3000`.
### Tune `--schedule-conservativeness`
`token usage` indicates the KV cache memory utilization of the server. `token usage > 0.9` means good utilization.
If you frequently see `token usage < 0.9` and `#queue-req > 0`, it means the server is too conservative about taking in new requests. You can decrease `--schedule-conservativeness` to a value like 0.3.
The server can end up being too conservative when users send many requests with a large `max_new_tokens` but the requests stop very early due to EOS or stop strings.
On the other hand, if `token usage` is very high and you frequently see warnings like
`decode out of memory happened, #retracted_reqs: 1, #new_token_ratio: 0.9998 -> 1.0000`, you can increase `--schedule-conservativeness` to a value like 1.3.
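If you want to watch these numbers programmatically while tuning, a small helper like the one below can pull `token usage` and `#queue-req` out of the decode log line shown above. This is only a sketch; the exact log format may change between versions, and the thresholds simply restate the guidance in this section:
```python
import re

# Example decode log line emitted by the server (see the top of this guide).
line = (
    "[gpu_id=0] Decode batch. #running-req: 233, #token: 370959, "
    "token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417"
)

match = re.search(r"token usage: ([0-9.]+).*#queue-req: (\d+)", line)
if match:
    token_usage = float(match.group(1))
    queue_req = int(match.group(2))
    if token_usage < 0.9 and queue_req > 0:
        print("Likely too conservative: consider lowering --schedule-conservativeness.")
    elif token_usage > 0.95:
        print("KV cache nearly full: watch for retraction warnings and consider raising --schedule-conservativeness.")
```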
### Tune `--dp-size` and `--tp-size`
Data parallelism is better for throughput. When there is enough GPU memory, always favor data parallelism over tensor parallelism.
### (Minor) Tune `--max-prefill-tokens`, `--mem-fraction-static`, `--max-running-requests`
If you see out of memory (OOM) errors, you can decrease these parameters.
If OOM happens during prefill, try to decrease `--max-prefill-tokens`.
If OOM happens during decoding, try to decrease `--max-running-requests`.
You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding.
### (Minor) Tune `--schedule-heuristic`
If you have many shared prefixes, use the default `--schedule-heuristic lpm` (`lpm` stands for longest prefix match).
If you have no shared prefixes at all, or you always send requests with shared prefixes together,
you can try `--schedule-heuristic fcfs` (`fcfs` stands for first come, first served).

docs/en/index.rst Normal file

@@ -0,0 +1,65 @@
Welcome to SGLang's tutorials!
====================================
.. figure:: ./_static/image/logo.png
  :width: 50%
  :align: center
  :alt: SGLang
  :class: no-scaled-link

.. raw:: html

   <p style="text-align:center">
   <strong>SGLang is yet another fast serving framework for large language models and vision language models.
   </strong>
   </p>

   <p style="text-align:center">
   <script async defer src="https://buttons.github.io/buttons.js"></script>
   <a class="github-button" href="https://github.com/sgl-project/sglang" data-show-count="true" data-size="large" aria-label="Star">Star</a>
   <a class="github-button" href="https://github.com/sgl-project/sglang/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
   <a class="github-button" href="https://github.com/sgl-project/sglang/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
   </p>
SGLang has the following core features:
* **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
* **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
Documentation
-------------
.. _hyperparameter_tuning:
.. toctree::
   :maxdepth: 1
   :caption: Hyperparameter Tuning

   hyperparameter_tuning.md

.. _custom_chat_template:
.. toctree::
   :maxdepth: 1
   :caption: Custom Chat Template

   custom_chat_template.md

.. _model_support:
.. toctree::
   :maxdepth: 1
   :caption: Model Support

   model_support.md

.. _sampling_params:
.. toctree::
   :maxdepth: 1
   :caption: Sampling Params

   sampling_params.md
Indices and tables
==================
* :ref:`genindex`
* :ref:`search`

docs/en/model_support.md Normal file

@@ -0,0 +1,16 @@
# How to Support a New Model
To support a new model in SGLang, you only need to add a single file under [SGLang Models Directory](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/models). You can learn from existing model implementations and create new files for the new models. Most models are based on the transformer architecture, making them very similar.
Another valuable resource is the [vLLM Models Directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models). vLLM has extensive coverage of models, and SGLang has reused vLLM for most parts of the model implementations. This similarity makes it easy to port many models from vLLM to SGLang.
To port a model from vLLM to SGLang, you can compare these two files: the [SGLang LLaMA Implementation](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/llama2.py) and the [vLLM LLaMA Implementation](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py). This comparison will help you understand how to convert a model implementation from vLLM to SGLang. The major difference is the replacement of PagedAttention with RadixAttention; the other parts are almost identical. Specifically (a rough skeleton is sketched after the list below):
- Replace vLLM's `Attention` with `RadixAttention`.
- Replace vLLM's `LogitsProcessor` with SGLang's `LogitsProcessor`.
- Remove `Sample`.
- Change `forward()` functions, and add `input_metadata`.
- Add `EntryClass` at the end.
- Test correctness by comparing the final logits and outputs of the following two commands:
- `python3 playground/reference_hf.py --model [new model]`
- `python3 -m sglang.bench_latency --model [new model] --correct --output-len 16 --trust-remote-code`
- Update [Supported Models](https://github.com/sgl-project/sglang/tree/main?tab=readme-ov-file#supported-models) in the [README](../README.md).
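As a rough illustration of what the steps above produce, a new model file typically ends up with the following shape. This is only a sketch; the class name, constructor arguments, and layer wiring are placeholders, and the real signatures should be copied from an existing file such as `llama2.py` in the models directory:
```python
# Illustrative skeleton only; copy real signatures from an existing model file.
from torch import nn


class MyModelForCausalLM(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Build embeddings and decoder layers here, using SGLang's RadixAttention
        # in place of vLLM's Attention and SGLang's LogitsProcessor in place of
        # vLLM's. The Sample step from the vLLM implementation is removed.
        self.config = config

    def forward(self, input_ids, positions, input_metadata):
        # Compared with the vLLM version, the extra `input_metadata` argument is
        # threaded through the attention layers and the logits processor.
        raise NotImplementedError("Fill in the decoder forward pass.")


# SGLang discovers the model through this variable at the end of the file.
EntryClass = MyModelForCausalLM
```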


@@ -0,0 +1,18 @@
# PyPI Package Release Process
## Update the version in code
Update the package version in `python/pyproject.toml` and `python/sglang/__init__.py`.
## Upload the PyPI package
```
pip install build twine
```
```
cd python
bash upload_pypi.sh
```
## Make a release in GitHub
Make a new release at https://github.com/sgl-project/sglang/releases/new.

docs/en/sampling_params.md Normal file

@@ -0,0 +1,144 @@
# Sampling Parameters in SGLang Runtime
This doc describes the sampling parameters of the SGLang Runtime.
The `/generate` endpoint accepts the following arguments in JSON format.
```python
@dataclass
class GenerateReqInput:
    # The input prompt. It can be a single prompt or a batch of prompts.
    text: Optional[Union[List[str], str]] = None
    # The token ids for text; one can either specify text or input_ids.
    input_ids: Optional[Union[List[List[int]], List[int]]] = None
    # The image input. It can be a file name, a url, or base64 encoded string.
    # See also python/sglang/srt/utils.py:load_image.
    image_data: Optional[Union[List[str], str]] = None
    # The sampling_params. See descriptions below.
    sampling_params: Union[List[Dict], Dict] = None
    # The request id.
    rid: Optional[Union[List[str], str]] = None
    # Whether to return logprobs.
    return_logprob: Optional[Union[List[bool], bool]] = None
    # The start location of the prompt for return_logprob.
    logprob_start_len: Optional[Union[List[int], int]] = None
    # The number of top logprobs to return.
    top_logprobs_num: Optional[Union[List[int], int]] = None
    # Whether to detokenize tokens in text in the returned logprobs.
    return_text_in_logprobs: bool = False
    # Whether to stream output.
    stream: bool = False
```
The `sampling_params` follows this format:
```python
# The maximum number of output tokens
max_new_tokens: int = 16,
# Stop when hitting any of the strings in this list.
stop: Optional[Union[str, List[str]]] = None,
# Sampling temperature
temperature: float = 1.0,
# Top-p sampling
top_p: float = 1.0,
# Top-k sampling
top_k: int = -1,
# Whether to ignore EOS token.
ignore_eos: bool = False,
# Whether to skip the special tokens during detokenization.
skip_special_tokens: bool = True,
# Whether to add spaces between special tokens during detokenization.
spaces_between_special_tokens: bool = True,
# Constrains the output to follow a given regular expression.
regex: Optional[str] = None,
# Do parallel sampling and return `n` outputs.
n: int = 1,
```
## Examples
### Normal
Launch a server
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
```
Send a request
```python
import requests
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 32,
        },
    },
)
print(response.json())
```
### Streaming
Send a request and stream the output
```python
import requests, json
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 256,
        },
        "stream": True,
    },
    stream=True,
)

prev = 0
for chunk in response.iter_lines(decode_unicode=False):
    chunk = chunk.decode("utf-8")
    if chunk and chunk.startswith("data:"):
        if chunk == "data: [DONE]":
            break
        data = json.loads(chunk[5:].strip("\n"))
        output = data["text"].strip()
        print(output[prev:], end="", flush=True)
        prev = len(output)
print("")
```
### Multi-modal
Launch a server
```
python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000
```
Download an image
```
curl -o example_image.png -L https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true
```
Send a request
```python
import requests
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nDescribe this picture ASSISTANT:",
        "image_data": "example_image.png",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 32,
        },
    },
)
print(response.json())
```
The `image_data` can be a file name, a URL, or a base64 encoded string. See also `python/sglang/srt/utils.py:load_image`.
Streaming is supported in a similar manner as [above](#streaming).
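### Batch

Since `text` and `sampling_params` also accept lists (see `GenerateReqInput` above), a batch of prompts can be sent in a single request. A minimal sketch; the response is expected to be a list with one entry per prompt, although the exact output format may vary between versions:
```python
import requests

# Batch request: `text` is a list of prompts; the same sampling_params apply to all of them.
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": [
            "The capital of France is",
            "The capital of Japan is",
        ],
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 8,
        },
    },
)
for item in response.json():
    print(item["text"])
```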

docs/en/test_process.md Normal file

@@ -0,0 +1,100 @@
# SRT Unit Tests
### Latency Alignment
Make sure your changes do not slow down the following benchmarks:
```
# single gpu
python -m sglang.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 32 --input-len 512 --output-len 256
python -m sglang.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 1 --input-len 512 --output-len 256
# multiple gpu
python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 32 --input-len 8192 --output-len 1
python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 1 --input-len 8100 --output-len 32
# moe model
python -m sglang.bench_latency --model-path databricks/dbrx-base --tp 8 --mem-fraction-static 0.6 --batch 4 --input-len 1024 --output-len 32
```
### High-level API
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
cd test/lang
python3 test_srt_backend.py
```
### Performance
#### MMLU
```
cd benchmark/mmlu
```
Follow README.md to download the data.
```
python3 bench_sglang.py --nsub 3
# Expected performance on A10G
# Total latency: 8.200
# Average accuracy: 0.413
```
#### GSM-8K
```
cd benchmark/gsm8k
```
Follow README.md to download the data.
```
python3 bench_sglang.py --num-q 200
# Expected performance on A10G
# Latency: 32.103
# Accuracy: 0.250
```
#### More
Please also test `benchmark/hellaswag` and `benchmark/latency_throughput`.
### More Models
#### LLaVA
```
python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000
```
```
cd benchmark/llava_bench
python3 bench_sglang.py
# Expected performance on A10G
# Latency: 50.031
```
## SGLang Unit Tests
```
export ANTHROPIC_API_KEY=
export OPENAI_API_KEY=
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
cd test/lang
python3 run_all.py
```
## OpenAI API server
```
cd test/srt
python test_openai_server.py
```
## Format
```
pip3 install pre-commit
cd sglang
pre-commit install
pre-commit run --all-files
```