Update ci workflows (#1804)
This commit is contained in:
43
.github/workflows/deploy-docs.yml
vendored
43
.github/workflows/deploy-docs.yml
vendored
@@ -1,20 +1,15 @@
|
|||||||
name: Build Documentation
|
name: Build Documentation
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
|
||||||
branches: [ main ]
|
|
||||||
pull_request:
|
|
||||||
branches: [ main ]
|
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
execute-notebooks:
|
execute-notebooks:
|
||||||
runs-on: 1-gpu-runner
|
runs-on: 1-gpu-runner
|
||||||
if: github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch'
|
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- name: Checkout code
|
||||||
with:
|
uses: actions/checkout@v3
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v4
|
uses: actions/setup-python@v4
|
||||||
@@ -23,22 +18,14 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[all]"
|
|
||||||
pip install -r docs/requirements.txt
|
pip install -r docs/requirements.txt
|
||||||
pip install nbconvert jupyter_client ipykernel ipywidgets matplotlib
|
|
||||||
pip install transformers==4.45.2
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
- name: Setup Jupyter Kernel
|
- name: Setup Jupyter Kernel
|
||||||
run: |
|
run: |
|
||||||
python -m ipykernel install --user --name python3 --display-name "Python 3"
|
python -m ipykernel install --user --name python3 --display-name "Python 3"
|
||||||
|
|
||||||
- name: Execute notebooks
|
- name: Execute notebooks
|
||||||
env:
|
|
||||||
HF_HOME: /hf_home
|
|
||||||
SGLANG_IS_IN_CI: true
|
|
||||||
CUDA_VISIBLE_DEVICES: 0
|
|
||||||
run: |
|
run: |
|
||||||
cd docs/en
|
cd docs/en
|
||||||
for nb in *.ipynb; do
|
for nb in *.ipynb; do
|
||||||
@@ -54,34 +41,18 @@ jobs:
|
|||||||
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
||||||
runs-on: 1-gpu-runner
|
runs-on: 1-gpu-runner
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- name: Checkout code
|
||||||
with:
|
uses: actions/checkout@v3
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v4
|
uses: actions/setup-python@v4
|
||||||
with:
|
with:
|
||||||
python-version: '3.9'
|
python-version: '3.9'
|
||||||
|
|
||||||
- name: Cache Python dependencies
|
|
||||||
uses: actions/cache@v3
|
|
||||||
with:
|
|
||||||
path: ~/.cache/pip
|
|
||||||
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
|
|
||||||
restore-keys: |
|
|
||||||
${{ runner.os }}-pip-
|
|
||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[all]"
|
|
||||||
pip install -r docs/requirements.txt
|
pip install -r docs/requirements.txt
|
||||||
pip install nbconvert jupyter_client ipykernel ipywidgets matplotlib
|
|
||||||
pip install transformers==4.45.2
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
- name: Install Pandoc
|
|
||||||
run: |
|
|
||||||
apt-get update
|
apt-get update
|
||||||
apt-get install -y pandoc
|
apt-get install -y pandoc
|
||||||
|
|
||||||
|
|||||||
6
.github/workflows/lint.yml
vendored
6
.github/workflows/lint.yml
vendored
@@ -8,10 +8,10 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
|
|
||||||
- name: Set up Python 3.9
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v2
|
uses: actions/setup-python@v4
|
||||||
with:
|
with:
|
||||||
python-version: 3.9
|
python-version: '3.9'
|
||||||
|
|
||||||
- name: Install pre-commit hook
|
- name: Install pre-commit hook
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
4
.github/workflows/nightly-eval.yml
vendored
4
.github/workflows/nightly-eval.yml
vendored
@@ -24,9 +24,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[all]"
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
- name: Nightly gsm8k Accuracy
|
- name: Nightly gsm8k Accuracy
|
||||||
timeout-minutes: 60
|
timeout-minutes: 60
|
||||||
|
|||||||
50
.github/workflows/pr-test.yml
vendored
50
.github/workflows/pr-test.yml
vendored
@@ -27,10 +27,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[dev]"
|
|
||||||
pip install transformers==4.45.2
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
- name: Run test
|
- name: Run test
|
||||||
timeout-minutes: 10
|
timeout-minutes: 10
|
||||||
@@ -47,10 +44,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[dev]"
|
|
||||||
pip install transformers==4.45.2
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
- name: Run test
|
- name: Run test
|
||||||
timeout-minutes: 20
|
timeout-minutes: 20
|
||||||
@@ -67,10 +61,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[dev]"
|
|
||||||
pip install transformers==4.45.2
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
- name: Run test
|
- name: Run test
|
||||||
timeout-minutes: 20
|
timeout-minutes: 20
|
||||||
@@ -87,10 +78,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[dev]"
|
|
||||||
pip install transformers==4.45.2
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
- name: Run test
|
- name: Run test
|
||||||
timeout-minutes: 20
|
timeout-minutes: 20
|
||||||
@@ -107,10 +95,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[dev]"
|
|
||||||
pip install transformers==4.45.2
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
- name: Run test
|
- name: Run test
|
||||||
timeout-minutes: 20
|
timeout-minutes: 20
|
||||||
@@ -127,10 +112,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[all]"
|
|
||||||
pip install transformers==4.45.2
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
- name: Benchmark Single Latency
|
- name: Benchmark Single Latency
|
||||||
timeout-minutes: 10
|
timeout-minutes: 10
|
||||||
@@ -165,10 +147,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[all]"
|
|
||||||
pip install transformers==4.45.2
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
- name: Benchmark Offline Throughput (w/o RadixAttention)
|
- name: Benchmark Offline Throughput (w/o RadixAttention)
|
||||||
timeout-minutes: 10
|
timeout-minutes: 10
|
||||||
@@ -197,10 +176,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[all]"
|
|
||||||
pip install transformers==4.45.2
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
- name: Benchmark Offline Throughput (TP=2)
|
- name: Benchmark Offline Throughput (TP=2)
|
||||||
timeout-minutes: 10
|
timeout-minutes: 10
|
||||||
@@ -229,10 +205,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[all]"
|
|
||||||
pip install transformers==4.45.2
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
git clone https://github.com/merrymercy/human-eval.git
|
git clone https://github.com/merrymercy/human-eval.git
|
||||||
cd human-eval
|
cd human-eval
|
||||||
@@ -253,10 +226,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -e "python[all]"
|
|
||||||
pip install transformers==4.45.2
|
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
|
||||||
|
|
||||||
git clone https://github.com/merrymercy/human-eval.git
|
git clone https://github.com/merrymercy/human-eval.git
|
||||||
cd human-eval
|
cd human-eval
|
||||||
|
|||||||
25
.github/workflows/release-github.yml
vendored
25
.github/workflows/release-github.yml
vendored
@@ -1,25 +0,0 @@
|
|||||||
name: Release GitHub
|
|
||||||
on:
|
|
||||||
workflow_dispatch:
|
|
||||||
jobs:
|
|
||||||
publish:
|
|
||||||
if: github.repository == 'sgl-project/sglang'
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
environment: 'prod'
|
|
||||||
steps:
|
|
||||||
- name: Checkout repository
|
|
||||||
uses: actions/checkout@v3
|
|
||||||
|
|
||||||
- name: Get version
|
|
||||||
id: get_version
|
|
||||||
run: |
|
|
||||||
version=$(cat python/sglang/version.py | cut -d'"' -f2)
|
|
||||||
echo "TAG=v$version" >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- name: Release
|
|
||||||
uses: softprops/action-gh-release@v1
|
|
||||||
env:
|
|
||||||
GITHUB_TOKEN: ${{ secrets.REPO_TOKEN }}
|
|
||||||
with:
|
|
||||||
name: Release ${{ steps.get_version.outputs.TAG }}
|
|
||||||
tag_name: ${{ steps.get_version.outputs.TAG }}
|
|
||||||
6
.github/workflows/release-pypi.yml
vendored
6
.github/workflows/release-pypi.yml
vendored
@@ -13,12 +13,14 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
environment: 'prod'
|
environment: 'prod'
|
||||||
steps:
|
steps:
|
||||||
- name: Set up python3.8
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v4
|
uses: actions/setup-python@v4
|
||||||
with:
|
with:
|
||||||
python-version: '3.8'
|
python-version: '3.9'
|
||||||
|
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v3
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
- name: Upload to pypi
|
- name: Upload to pypi
|
||||||
run: |
|
run: |
|
||||||
cd python
|
cd python
|
||||||
|
|||||||
23
docs/Makefile
Normal file
23
docs/Makefile
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
# Minimal makefile for Sphinx documentation
|
||||||
|
#
|
||||||
|
|
||||||
|
# You can set these variables from the command line, and also
|
||||||
|
# from the environment for the first two.
|
||||||
|
SPHINXOPTS ?=
|
||||||
|
SPHINXBUILD ?= sphinx-build
|
||||||
|
SOURCEDIR = .
|
||||||
|
BUILDDIR = _build
|
||||||
|
|
||||||
|
# Put it first so that "make" without argument is like "make help".
|
||||||
|
help:
|
||||||
|
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
|
|
||||||
|
.PHONY: help Makefile
|
||||||
|
|
||||||
|
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||||
|
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||||
|
%: Makefile
|
||||||
|
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf $(BUILDDIR)/*
|
||||||
33
docs/README.md
Normal file
33
docs/README.md
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# SGLang Documentation
|
||||||
|
|
||||||
|
## Build the documentation website
|
||||||
|
|
||||||
|
### Dependency
|
||||||
|
```
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### Build
|
||||||
|
```
|
||||||
|
make html
|
||||||
|
```
|
||||||
|
|
||||||
|
### Clean
|
||||||
|
To remove all generated files:
|
||||||
|
```
|
||||||
|
make clean
|
||||||
|
```
|
||||||
|
|
||||||
|
### Serve (preview)
|
||||||
|
Run an HTTP server and visit http://localhost:8000 in your browser.
|
||||||
|
```
|
||||||
|
python3 -m http.server --d _build/html
|
||||||
|
```
|
||||||
|
|
||||||
|
### Deploy
|
||||||
|
Clone [sgl-project.github.io](https://github.com/sgl-project/sgl-project.github.io) and make sure you have write access.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export DOC_SITE_PATH=../../sgl-project.github.io # update this with your path
|
||||||
|
python3 deploy.py
|
||||||
|
```
|
||||||
|
Before Width: | Height: | Size: 393 KiB After Width: | Height: | Size: 393 KiB |
@@ -20,7 +20,7 @@ curl http://localhost:30000/generate \
|
|||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
Learn more about the argument specification, streaming, and multi-modal support [here](https://sglang.readthedocs.io/en/latest/sampling_params.html).
|
Learn more about the argument specification, streaming, and multi-modal support [here](https://sgl-project.github.io/sampling_params.html).
|
||||||
|
|
||||||
### OpenAI Compatible API
|
### OpenAI Compatible API
|
||||||
In addition, the server supports OpenAI-compatible APIs.
|
In addition, the server supports OpenAI-compatible APIs.
|
||||||
@@ -74,7 +74,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|||||||
```
|
```
|
||||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --mem-fraction-static 0.7
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --mem-fraction-static 0.7
|
||||||
```
|
```
|
||||||
- See [hyperparameter tuning](https://sglang.readthedocs.io/en/latest/hyperparameter_tuning.html) on tuning hyperparameters for better performance.
|
- See [hyperparameter tuning](https://sgl-project.github.io/hyperparameter_tuning.html) on tuning hyperparameters for better performance.
|
||||||
- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
|
- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
|
||||||
```
|
```
|
||||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
|
||||||
@@ -83,7 +83,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|||||||
- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
|
- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
|
||||||
- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
|
- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
|
||||||
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
|
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
|
||||||
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sglang.readthedocs.io/en/latest/custom_chat_template.html).
|
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sgl-project.github.io/custom_chat_template.html).
|
||||||
- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
|
- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
|
||||||
```
|
```
|
||||||
# Node 0
|
# Node 0
|
||||||
@@ -158,7 +158,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
|
|||||||
- gte-Qwen2
|
- gte-Qwen2
|
||||||
- `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
|
- `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
|
||||||
|
|
||||||
Instructions for supporting a new model are [here](https://sglang.readthedocs.io/en/latest/model_support.html).
|
Instructions for supporting a new model are [here](https://sgl-project.github.io/model_support.html).
|
||||||
|
|
||||||
#### Use Models From ModelScope
|
#### Use Models From ModelScope
|
||||||
<details>
|
<details>
|
||||||
@@ -3,7 +3,7 @@ import sys
|
|||||||
|
|
||||||
sys.path.insert(0, os.path.abspath("../.."))
|
sys.path.insert(0, os.path.abspath("../.."))
|
||||||
|
|
||||||
version_file = "../../python/sglang/version.py"
|
version_file = "../python/sglang/version.py"
|
||||||
with open(version_file, "r") as f:
|
with open(version_file, "r") as f:
|
||||||
exec(compile(f.read(), version_file, "exec"))
|
exec(compile(f.read(), version_file, "exec"))
|
||||||
__version__ = locals()["__version__"]
|
__version__ = locals()["__version__"]
|
||||||
@@ -11,4 +11,4 @@ pre-commit run --all-files
|
|||||||
```
|
```
|
||||||
|
|
||||||
## Add Unit Tests
|
## Add Unit Tests
|
||||||
Add unit tests under [sglang/test](../../test). You can learn how to add and run tests from the README.md in that folder.
|
Add unit tests under [sglang/test](https://github.com/sgl-project/sglang/tree/main/test). You can learn how to add and run tests from the README.md in that folder.
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
# Custom Chat Template in SGLang Runtime
|
# Custom Chat Template in SGLang Runtime
|
||||||
|
|
||||||
**NOTE**: There are two chat template systems in SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined at [conversation.py](../../python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined at [chat_template.py](../../python/sglang/lang/chat_template.py)).
|
**NOTE**: There are two chat template systems in the SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined in [conversation.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined in [chat_template.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/lang/chat_template.py)).
|
||||||
|
|
||||||
By default, the server uses the chat template specified in the model tokenizer from Hugging Face.
|
By default, the server uses the chat template specified in the model tokenizer from Hugging Face.
|
||||||
It should just work for most official models such as Llama-2/Llama-3.
|
It should just work for most official models such as Llama-2/Llama-3.
|
||||||
22
docs/deploy.py
Normal file
22
docs/deploy.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
|
def run_cmd(cmd):
|
||||||
|
print(cmd)
|
||||||
|
os.system(cmd)
|
||||||
|
|
||||||
|
|
||||||
|
run_cmd("cd $DOC_SITE_PATH; git pull")
|
||||||
|
|
||||||
|
# (Optional) Remove old files
|
||||||
|
# run_cmd("rm -rf $DOC_SITE_PATH/*")
|
||||||
|
|
||||||
|
run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
|
||||||
|
|
||||||
|
cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
|
||||||
|
run_cmd(
|
||||||
|
f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main"
|
||||||
|
)
|
||||||
0
docs/deploy_docs.sh
Normal file
0
docs/deploy_docs.sh
Normal file
@@ -1,12 +0,0 @@
|
|||||||
SPHINXOPTS =
|
|
||||||
SPHINXBUILD = sphinx-build
|
|
||||||
SOURCEDIR = .
|
|
||||||
BUILDDIR = _build
|
|
||||||
|
|
||||||
help:
|
|
||||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
|
||||||
|
|
||||||
.PHONY: help Makefile
|
|
||||||
|
|
||||||
%: Makefile
|
|
||||||
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
|
||||||
@@ -10,24 +10,28 @@ The core features include:
|
|||||||
- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
|
- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
|
||||||
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
|
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
|
||||||
|
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
:caption: Getting Started
|
:caption: Getting Started
|
||||||
|
|
||||||
install.md
|
install.md
|
||||||
send_request.ipynb
|
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
:caption: Backend Tutorial
|
:caption: Backend Tutorial
|
||||||
|
|
||||||
backend.md
|
backend.md
|
||||||
|
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
:caption: Frontend Tutorial
|
:caption: Frontend Tutorial
|
||||||
|
|
||||||
frontend.md
|
frontend.md
|
||||||
|
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
:caption: References
|
:caption: References
|
||||||
@@ -39,4 +43,3 @@ The core features include:
|
|||||||
choices_methods.md
|
choices_methods.md
|
||||||
benchmark_and_profiling.md
|
benchmark_and_profiling.md
|
||||||
troubleshooting.md
|
troubleshooting.md
|
||||||
embedding_model.ipynb
|
|
||||||
@@ -48,9 +48,9 @@ docker run --gpus all \
|
|||||||
<summary>More</summary>
|
<summary>More</summary>
|
||||||
|
|
||||||
> This method is recommended if you plan to serve it as a service.
|
> This method is recommended if you plan to serve it as a service.
|
||||||
> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
|
> A better approach is to use the [k8s-sglang-service.yaml](https://github.com/sgl-project/sglang/blob/main/docker/k8s-sglang-service.yaml).
|
||||||
|
|
||||||
1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
|
1. Copy the [compose.yaml](https://github.com/sgl-project/sglang/blob/main/docker/compose.yaml) to your local machine.
|
||||||
2. Execute the command `docker compose up -d` in your terminal.
|
2. Execute the command `docker compose up -d` in your terminal.
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
@@ -1,12 +1,17 @@
|
|||||||
|
ipykernel
|
||||||
|
ipywidgets
|
||||||
|
jupyter_client
|
||||||
markdown>=3.4.0
|
markdown>=3.4.0
|
||||||
|
matplotlib
|
||||||
myst-parser
|
myst-parser
|
||||||
|
nbconvert
|
||||||
|
nbsphinx
|
||||||
|
pandoc
|
||||||
|
pillow
|
||||||
|
pydantic
|
||||||
sphinx
|
sphinx
|
||||||
sphinx-book-theme
|
sphinx-book-theme
|
||||||
sphinx-copybutton
|
sphinx-copybutton
|
||||||
sphinx-tabs
|
sphinx-tabs
|
||||||
sphinxcontrib-mermaid
|
sphinxcontrib-mermaid
|
||||||
pillow
|
urllib3<2.0.0
|
||||||
pydantic
|
|
||||||
urllib3<2.0.0
|
|
||||||
nbsphinx
|
|
||||||
pandoc
|
|
||||||
@@ -194,7 +194,7 @@ Since we compute penalty algorithms through CUDA, the logic stores relevant para
|
|||||||
|
|
||||||
You can run your own benchmark with desired parameters on your own hardware to make sure it's not OOMing before using.
|
You can run your own benchmark with desired parameters on your own hardware to make sure it's not OOMing before using.
|
||||||
|
|
||||||
Tuning `--mem-fraction-static` and/or `--max-running-requests` will help. See [here](hyperparameter_tuning.md#minor-tune---max-prefill-tokens---mem-fraction-static---max-running-requests) for more information.
|
Tuning `--mem-fraction-static` and/or `--max-running-requests` will help.
|
||||||
|
|
||||||
### Benchmarks
|
### Benchmarks
|
||||||
|
|
||||||
1
docs/serve.sh
Normal file
1
docs/serve.sh
Normal file
@@ -0,0 +1 @@
|
|||||||
|
python3 -m http.server --d _build/html
|
||||||
@@ -5,9 +5,9 @@ This page lists some common errors and tips for fixing them.
|
|||||||
## CUDA error: an illegal memory access was encountered
|
## CUDA error: an illegal memory access was encountered
|
||||||
This error may be due to kernel errors or out-of-memory issues.
|
This error may be due to kernel errors or out-of-memory issues.
|
||||||
- If it is a kernel error, it is not easy to fix.
|
- If it is a kernel error, it is not easy to fix.
|
||||||
- If it is out-of-memory, sometimes it will report this error instead of "Out-of-memory." In this case, try setting a smaller value for `--mem-fraction-static`. The default value of `--mem-fraction-static` is around 0.8 - 0.9. https://github.com/sgl-project/sglang/blob/1edd4e07d6ad52f4f63e7f6beaa5987c1e1cf621/python/sglang/srt/server_args.py#L92-L102
|
- If it is out-of-memory, sometimes it will report this error instead of "Out-of-memory." In this case, try setting a smaller value for `--mem-fraction-static`. The default value of `--mem-fraction-static` is around 0.8 - 0.9.
|
||||||
|
|
||||||
## The server hangs
|
## The server hangs
|
||||||
If the server hangs, try disabling some optimizations when launching the server.
|
If the server hangs, try disabling some optimizations when launching the server.
|
||||||
- Add `--disable-cuda-graph`.
|
- Add `--disable-cuda-graph`.
|
||||||
- Add `--disable-flashinfer-sampling`.
|
- Add `--sampling-backend pytorch`.
|
||||||
4
scripts/ci_install_dependency.sh
Normal file
4
scripts/ci_install_dependency.sh
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
pip install --upgrade pip
|
||||||
|
pip install -e "python[all]"
|
||||||
|
pip install transformers==4.45.2
|
||||||
|
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
||||||
Reference in New Issue
Block a user