Update ci workflows (#1804)
43  .github/workflows/deploy-docs.yml  (vendored)
@@ -1,20 +1,15 @@
 name: Build Documentation

 on:
   push:
     branches: [ main ]
   pull_request:
     branches: [ main ]
   workflow_dispatch:

 jobs:
   execute-notebooks:
     runs-on: 1-gpu-runner
-    if: github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch'
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     steps:
-      - uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
+      - name: Checkout code
+        uses: actions/checkout@v3

       - name: Set up Python
         uses: actions/setup-python@v4
@@ -23,22 +18,14 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
+          bash scripts/ci_install_dependency.sh
           pip install -r docs/requirements.txt
           pip install nbconvert jupyter_client ipykernel ipywidgets matplotlib
-          pip install transformers==4.45.2
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

       - name: Setup Jupyter Kernel
         run: |
           python -m ipykernel install --user --name python3 --display-name "Python 3"

       - name: Execute notebooks
         env:
           HF_HOME: /hf_home
           SGLANG_IS_IN_CI: true
           CUDA_VISIBLE_DEVICES: 0
         run: |
           cd docs/en
           for nb in *.ipynb; do
@@ -54,34 +41,18 @@ jobs:
     if: github.event_name == 'push' && github.ref == 'refs/heads/main'
     runs-on: 1-gpu-runner
     steps:
-      - uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
+      - name: Checkout code
+        uses: actions/checkout@v3

       - name: Set up Python
         uses: actions/setup-python@v4
         with:
           python-version: '3.9'

-      - name: Cache Python dependencies
-        uses: actions/cache@v3
-        with:
-          path: ~/.cache/pip
-          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-pip-

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
+          bash scripts/ci_install_dependency.sh
           pip install -r docs/requirements.txt
           pip install nbconvert jupyter_client ipykernel ipywidgets matplotlib
-          pip install transformers==4.45.2
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

       - name: Install Pandoc
         run: |
           apt-get update
           apt-get install -y pandoc
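The execute-notebooks hunk above is cut off at the `for nb in *.ipynb; do` line. As an editor's sketch of what such a step typically runs (the nbconvert flags and the 600 s timeout here are assumptions, not the committed loop body):

```bash
# Hedged sketch: execute each docs notebook in place with the "python3" kernel
# installed by the Setup Jupyter Kernel step. Flags and timeout are assumptions.
for nb in *.ipynb; do
    echo "Executing $nb"
    jupyter nbconvert --to notebook --execute --inplace \
        --ExecutePreprocessor.timeout=600 \
        --ExecutePreprocessor.kernel_name=python3 \
        "$nb"
done
```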
6  .github/workflows/lint.yml  (vendored)
@@ -8,10 +8,10 @@ jobs:
     steps:
       - uses: actions/checkout@v2

-      - name: Set up Python 3.9
-        uses: actions/setup-python@v2
+      - name: Set up Python
+        uses: actions/setup-python@v4
         with:
-          python-version: 3.9
+          python-version: '3.9'

       - name: Install pre-commit hook
         run: |
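The `run:` block is truncated here. Given the contribution-guide excerpt quoted later in this commit (`pre-commit run --all-files`), the step presumably amounts to something like the following (an assumption, not the captured text):

```bash
# Assumed lint step body, grounded in the contribution guide quoted below.
pip install pre-commit
pre-commit run --all-files
```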
4  .github/workflows/nightly-eval.yml  (vendored)
@@ -24,9 +24,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+          bash scripts/ci_install_dependency.sh

       - name: Nightly gsm8k Accuracy
         timeout-minutes: 60
50  .github/workflows/pr-test.yml  (vendored)
@@ -27,10 +27,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[dev]"
-          pip install transformers==4.45.2
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+          bash scripts/ci_install_dependency.sh

       - name: Run test
         timeout-minutes: 10
@@ -47,10 +44,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[dev]"
-          pip install transformers==4.45.2
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+          bash scripts/ci_install_dependency.sh

       - name: Run test
         timeout-minutes: 20
@@ -67,10 +61,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[dev]"
-          pip install transformers==4.45.2
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+          bash scripts/ci_install_dependency.sh

       - name: Run test
         timeout-minutes: 20
@@ -87,10 +78,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[dev]"
-          pip install transformers==4.45.2
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+          bash scripts/ci_install_dependency.sh

       - name: Run test
         timeout-minutes: 20
@@ -107,10 +95,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[dev]"
-          pip install transformers==4.45.2
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+          bash scripts/ci_install_dependency.sh

       - name: Run test
         timeout-minutes: 20
@@ -127,10 +112,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
-          pip install transformers==4.45.2
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+          bash scripts/ci_install_dependency.sh

       - name: Benchmark Single Latency
         timeout-minutes: 10
@@ -165,10 +147,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
-          pip install transformers==4.45.2
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+          bash scripts/ci_install_dependency.sh

       - name: Benchmark Offline Throughput (w/o RadixAttention)
         timeout-minutes: 10
@@ -197,10 +176,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
-          pip install transformers==4.45.2
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+          bash scripts/ci_install_dependency.sh

       - name: Benchmark Offline Throughput (TP=2)
         timeout-minutes: 10
@@ -229,10 +205,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
-          pip install transformers==4.45.2
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+          bash scripts/ci_install_dependency.sh

           git clone https://github.com/merrymercy/human-eval.git
           cd human-eval
@@ -253,10 +226,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
-          pip install transformers==4.45.2
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+          bash scripts/ci_install_dependency.sh

           git clone https://github.com/merrymercy/human-eval.git
           cd human-eval
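Both human-eval hunks are truncated after `cd human-eval`. The conventional next step for that repository is an editable install; the exact command is an assumption here, not part of the captured diff:

```bash
git clone https://github.com/merrymercy/human-eval.git
cd human-eval
pip install -e .  # assumed editable install; the committed line is cut off in this capture
```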
25  .github/workflows/release-github.yml  (vendored, deleted)
@@ -1,25 +0,0 @@
-name: Release GitHub
-on:
-  workflow_dispatch:
-jobs:
-  publish:
-    if: github.repository == 'sgl-project/sglang'
-    runs-on: ubuntu-latest
-    environment: 'prod'
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v3
-
-      - name: Get version
-        id: get_version
-        run: |
-          version=$(cat python/sglang/version.py | cut -d'"' -f2)
-          echo "TAG=v$version" >> $GITHUB_OUTPUT
-
-      - name: Release
-        uses: softprops/action-gh-release@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.REPO_TOKEN }}
-        with:
-          name: Release ${{ steps.get_version.outputs.TAG }}
-          tag_name: ${{ steps.get_version.outputs.TAG }}
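The deleted workflow's `Get version` step parses the release tag out of `version.py` by splitting on double quotes. For illustration, with a made-up version string (`0.3.4` is an example, not the repository's actual value):

```bash
# Given a version file containing a line like:  __version__ = "0.3.4"
version=$(cut -d'"' -f2 python/sglang/version.py)
echo "TAG=v$version"   # prints: TAG=v0.3.4
```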
6  .github/workflows/release-pypi.yml  (vendored)
@@ -13,12 +13,14 @@ jobs:
     runs-on: ubuntu-latest
     environment: 'prod'
     steps:
-      - name: Set up python3.8
+      - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: '3.8'
+          python-version: '3.9'

       - name: Checkout repository
         uses: actions/checkout@v3

       - name: Upload to pypi
         run: |
           cd python
23  docs/Makefile  (Normal file)
@@ -0,0 +1,23 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+clean:
+	rm -rf $(BUILDDIR)/*
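The catch-all `%: Makefile` rule forwards any unknown target name to sphinx-build's "make mode", so the Makefile needs no per-builder rules. For example:

```bash
make html       # runs: sphinx-build -M html . _build
make linkcheck  # any builder name Sphinx knows is forwarded the same way
make clean      # explicit rule above: rm -rf _build/*
```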
33  docs/README.md  (Normal file)
@@ -0,0 +1,33 @@
+# SGLang Documentation
+
+## Build the documentation website
+
+### Dependency
+```
+pip install -r requirements.txt
+```
+
+### Build
+```
+make html
+```
+
+### Clean
+To remove all generated files:
+```
+make clean
+```
+
+### Serve (preview)
+Run an HTTP server and visit http://localhost:8000 in your browser.
+```
+python3 -m http.server --d _build/html
+```
+
+### Deploy
+Clone [sgl-project.github.io](https://github.com/sgl-project/sgl-project.github.io) and make sure you have write access.
+
+```bash
+export DOC_SITE_PATH=../../sgl-project.github.io # update this with your path
+python3 deploy.py
+```
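A note on the serve command above: `--d` only works because argparse expands unambiguous option prefixes to `--directory`. The spelled-out form is sturdier and lets you pick the port explicitly:

```bash
# Equivalent to the README's command, with the full flag name and an explicit port.
python3 -m http.server --directory _build/html 8000
```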
[binary image changed: 393 KiB before and after, dimensions unchanged; filename not captured]
README.md
@@ -20,7 +20,7 @@ curl http://localhost:30000/generate \
   }'
 ```

-Learn more about the argument specification, streaming, and multi-modal support [here](https://sglang.readthedocs.io/en/latest/sampling_params.html).
+Learn more about the argument specification, streaming, and multi-modal support [here](https://sgl-project.github.io/sampling_params.html).

 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
@@ -74,7 +74,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --mem-fraction-static 0.7
 ```
-- See [hyperparameter tuning](https://sglang.readthedocs.io/en/latest/hyperparameter_tuning.html) on tuning hyperparameters for better performance.
+- See [hyperparameter tuning](https://sgl-project.github.io/hyperparameter_tuning.html) on tuning hyperparameters for better performance.
 - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
@@ -83,7 +83,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
-- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sglang.readthedocs.io/en/latest/custom_chat_template.html).
+- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sgl-project.github.io/custom_chat_template.html).
 - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
 ```
 # Node 0
@@ -158,7 +158,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - gte-Qwen2
   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`

-Instructions for supporting a new model are [here](https://sglang.readthedocs.io/en/latest/model_support.html).
+Instructions for supporting a new model are [here](https://sgl-project.github.io/model_support.html).

 #### Use Models From ModelScope
 <details>
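The README's multi-node example above is cut off at `# Node 0`. A hedged reconstruction of the two-node TP=4 launch it describes follows; the host `sgl-dev-0`, port `50000`, and `--nnodes 2` come from the README text itself, but flag names such as `--nccl-init-addr` and `--node-rank` are assumptions based on sglang of this era, not the diff's content:

```bash
# Node 0 (hostname sgl-dev-0, port 50000, per the README text above)
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct \
    --tp 4 --nnodes 2 --node-rank 0 --nccl-init-addr sgl-dev-0:50000

# Node 1
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct \
    --tp 4 --nnodes 2 --node-rank 1 --nccl-init-addr sgl-dev-0:50000
```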
docs/conf.py
@@ -3,7 +3,7 @@ import sys

 sys.path.insert(0, os.path.abspath("../.."))

-version_file = "../../python/sglang/version.py"
+version_file = "../python/sglang/version.py"
 with open(version_file, "r") as f:
     exec(compile(f.read(), version_file, "exec"))
 __version__ = locals()["__version__"]
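The conf.py snippet reads the version without importing the package: it executes version.py in the current namespace and pulls `__version__` back out of `locals()`. The same effect from a shell, for illustration (run from docs/, the file's new location):

```bash
# Prints the package version without installing sglang.
python3 -c 'exec(open("../python/sglang/version.py").read()); print(__version__)'
```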
@@ -11,4 +11,4 @@ pre-commit run --all-files
 ```

 ## Add Unit Tests
-Add unit tests under [sglang/test](../../test). You can learn how to add and run tests from the README.md in that folder.
+Add unit tests under [sglang/test](https://github.com/sgl-project/sglang/tree/main/test). You can learn how to add and run tests from the README.md in that folder.
docs/custom_chat_template.md
@@ -1,6 +1,6 @@
 # Custom Chat Template in SGLang Runtime

-**NOTE**: There are two chat template systems in SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined at [conversation.py](../../python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined at [chat_template.py](../../python/sglang/lang/chat_template.py)).
+**NOTE**: There are two chat template systems in SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined at [conversation.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined at [chat_template.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/lang/chat_template.py)).

 By default, the server uses the chat template specified in the model tokenizer from Hugging Face.
 It should just work for most official models such as Llama-2/Llama-3.
22  docs/deploy.py  (Normal file)
@@ -0,0 +1,22 @@
+#!/usr/bin/python3
+
+import os
+from datetime import datetime
+
+
+def run_cmd(cmd):
+    print(cmd)
+    os.system(cmd)
+
+
+run_cmd("cd $DOC_SITE_PATH; git pull")
+
+# (Optional) Remove old files
+# run_cmd("rm -rf $ALPA_SITE_PATH/*")
+
+run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
+
+cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
+run_cmd(
+    f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main"
+)
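Putting the new Makefile, deploy.py, and the `DOC_SITE_PATH` variable together, the docs/README.md flow above amounts to the following (the paths are the README's own example values):

```bash
cd docs
make html                                          # build the site into _build/html
export DOC_SITE_PATH=../../sgl-project.github.io   # your clone of the site repo
python3 deploy.py                                  # pull, copy _build/html, commit, push
```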
0  docs/deploy_docs.sh  (Normal file, empty)
@@ -1,12 +0,0 @@
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-SOURCEDIR     = .
-BUILDDIR      = _build
-
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
docs/index.rst
@@ -10,24 +10,28 @@ The core features include:
 - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.


 .. toctree::
    :maxdepth: 1
    :caption: Getting Started

    install.md
    send_request.ipynb


 .. toctree::
    :maxdepth: 1
    :caption: Backend Tutorial

    backend.md


 .. toctree::
    :maxdepth: 1
    :caption: Frontend Tutorial

    frontend.md


 .. toctree::
    :maxdepth: 1
    :caption: References
@@ -39,4 +43,3 @@ The core features include:
    choices_methods.md
    benchmark_and_profiling.md
    troubleshooting.md
-   embedding_model.ipynb
docs/install.md
@@ -48,9 +48,9 @@ docker run --gpus all \
 <summary>More</summary>

 > This method is recommended if you plan to serve it as a service.
-> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+> A better approach is to use the [k8s-sglang-service.yaml](https://github.com/sgl-project/sglang/blob/main/docker/k8s-sglang-service.yaml).

-1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+1. Copy the [compose.yml](https://github.com/sgl-project/sglang/blob/main/docker/compose.yaml) to your local machine
 2. Execute the command `docker compose up -d` in your terminal.
 </details>
docs/requirements.txt
@@ -1,12 +1,17 @@
+ipykernel
+ipywidgets
+jupyter_client
 markdown>=3.4.0
+matplotlib
 myst-parser
+nbconvert
+nbsphinx
+pandoc
+pillow
+pydantic
 sphinx
 sphinx-book-theme
 sphinx-copybutton
 sphinx-tabs
 sphinxcontrib-mermaid
-pillow
-pydantic
-urllib3<2.0.0
-nbsphinx
-pandoc
+urllib3<2.0.0
@@ -194,7 +194,7 @@ Since we compute penalty algorithms through CUDA, the logic stores relevant para

 You can run your own benchmark with desired parameters on your own hardware to make sure it's not OOMing before using.

-Tuning `--mem-fraction-static` and/or `--max-running-requests` will help. See [here](hyperparameter_tuning.md#minor-tune---max-prefill-tokens---mem-fraction-static---max-running-requests) for more information.
+Tuning `--mem-fraction-static` and/or `--max-running-requests` will help.

 ### Benchmarks
1  docs/serve.sh  (Normal file)
@@ -0,0 +1 @@
+python3 -m http.server --d _build/html
docs/troubleshooting.md
@@ -5,9 +5,9 @@ This page lists some common errors and tips for fixing them.

 ## CUDA error: an illegal memory access was encountered
 This error may be due to kernel errors or out-of-memory issues.
 - If it is a kernel error, it is not easy to fix.
-- If it is out-of-memory, sometimes it will report this error instead of "Out-of-memory." In this case, try setting a smaller value for `--mem-fraction-static`. The default value of `--mem-fraction-static` is around 0.8 - 0.9. https://github.com/sgl-project/sglang/blob/1edd4e07d6ad52f4f63e7f6beaa5987c1e1cf621/python/sglang/srt/server_args.py#L92-L102
+- If it is out-of-memory, sometimes it will report this error instead of "Out-of-memory." In this case, try setting a smaller value for `--mem-fraction-static`. The default value of `--mem-fraction-static` is around 0.8 - 0.9.

 ## The server hangs
 If the server hangs, try disabling some optimizations when launching the server.
 - Add `--disable-cuda-graph`.
-- Add `--disable-flashinfer-sampling`.
+- Add `--sampling-backend pytorch`.
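For concreteness, the troubleshooting advice above combines into a launch line like the following; the model path and the `0.7` value are placeholders, not prescriptions from the diff:

```bash
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct \
    --mem-fraction-static 0.7 --disable-cuda-graph --sampling-backend pytorch
```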
4  scripts/ci_install_dependency.sh  (Normal file)
@@ -0,0 +1,4 @@
+pip install --upgrade pip
+pip install -e "python[all]"
+pip install transformers==4.45.2
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
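Since this new script pins transformers and the flashinfer wheel index, it doubles as a way to reproduce the CI environment locally; assuming a CUDA 12.1 / torch 2.4 setup to match the wheel URL:

```bash
# From the repository root; matches the single step the updated workflows now run.
bash scripts/ci_install_dependency.sh
```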