Update ci workflows (#1804)

2024-10-26 04:32:36 -07:00
parent c26507484f
commit 6aa94b967c
34 changed files with 135 additions and 140 deletions
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -1,20 +1,15 @@
 name: Build Documentation
 on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:
 jobs:
  execute-notebooks:
    runs-on: 1-gpu-runner
-    if: github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch'
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    steps:
-      - uses: actions/checkout@v3
+      - name: Checkout code
-        with:
+        uses: actions/checkout@v3
          fetch-depth: 0
      - name: Set up Python
        uses: actions/setup-python@v4
@@ -23,22 +18,14 @@ jobs:
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[all]"
          pip install -r docs/requirements.txt
          pip install nbconvert jupyter_client ipykernel ipywidgets matplotlib
          pip install transformers==4.45.2
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Setup Jupyter Kernel
        run: |
          python -m ipykernel install --user --name python3 --display-name "Python 3"
      - name: Execute notebooks
        env:
          HF_HOME: /hf_home
          SGLANG_IS_IN_CI: true
          CUDA_VISIBLE_DEVICES: 0
        run: |
          cd docs/en
          for nb in *.ipynb; do
@@ -54,34 +41,18 @@ jobs:
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    runs-on: 1-gpu-runner
    steps:
-      - uses: actions/checkout@v3
+      - name: Checkout code
-        with:
+        uses: actions/checkout@v3
          fetch-depth: 0
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'
      - name: Cache Python dependencies
        uses: actions/cache@v3
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[all]"
          pip install -r docs/requirements.txt
          pip install nbconvert jupyter_client ipykernel ipywidgets matplotlib
          pip install transformers==4.45.2
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Install Pandoc
        run: |
          apt-get update
          apt-get install -y pandoc
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -8,10 +8,10 @@ jobs:
    steps:
      - uses: actions/checkout@v2
-      - name: Set up Python 3.9
+      - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
        with:
-          python-version: 3.9
+          python-version: '3.9'
      - name: Install pre-commit hook
        run: |
--- a/.github/workflows/nightly-eval.yml
+++ b/.github/workflows/nightly-eval.yml
@@ -24,9 +24,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Nightly gsm8k Accuracy
        timeout-minutes: 60
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -27,10 +27,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[dev]"
          pip install transformers==4.45.2
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Run test
        timeout-minutes: 10
@@ -47,10 +44,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[dev]"
          pip install transformers==4.45.2
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Run test
        timeout-minutes: 20
@@ -67,10 +61,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[dev]"
          pip install transformers==4.45.2
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Run test
        timeout-minutes: 20
@@ -87,10 +78,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[dev]"
          pip install transformers==4.45.2
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Run test
        timeout-minutes: 20
@@ -107,10 +95,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[dev]"
          pip install transformers==4.45.2
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Run test
        timeout-minutes: 20
@@ -127,10 +112,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[all]"
          pip install transformers==4.45.2
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Benchmark Single Latency
        timeout-minutes: 10
@@ -165,10 +147,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[all]"
          pip install transformers==4.45.2
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Benchmark Offline Throughput (w/o RadixAttention)
        timeout-minutes: 10
@@ -197,10 +176,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[all]"
          pip install transformers==4.45.2
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Benchmark Offline Throughput (TP=2)
        timeout-minutes: 10
@@ -229,10 +205,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[all]"
          pip install transformers==4.45.2
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
@@ -253,10 +226,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          bash scripts/ci_install_dependency.sh
          pip install -e "python[all]"
          pip install transformers==4.45.2
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
--- a/.github/workflows/release-github.yml
+++ b/.github/workflows/release-github.yml
@@ -1,25 +0,0 @@
 # Manually triggered workflow (workflow_dispatch only) that creates a GitHub
 # release, via softprops/action-gh-release, named and tagged after the
 # version string read from python/sglang/version.py.
 name: Release GitHub
 on:
  workflow_dispatch:
 jobs:
  publish:
    # Skip the job when the workflow runs in any repository other than
    # sgl-project/sglang (e.g. a fork).
    if: github.repository == 'sgl-project/sglang'
    runs-on: ubuntu-latest
    environment: 'prod'
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      # Takes the second double-quote-delimited field of every line of
      # version.py and publishes it, prefixed with "v", as the TAG step output.
      # NOTE(review): presumably version.py is a single `__version__ = "x.y.z"`
      # line; confirm, since additional lines would also be emitted.
      - name: Get version
        id: get_version
        run: |
          version=$(cat python/sglang/version.py | cut -d'"' -f2)
          echo "TAG=v$version" >> $GITHUB_OUTPUT
      # The release name and the tag both come from the TAG output above; the
      # action is given the REPO_TOKEN secret as its GITHUB_TOKEN.
      - name: Release
        uses: softprops/action-gh-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.REPO_TOKEN }}
        with:
          name: Release ${{ steps.get_version.outputs.TAG }}
          tag_name: ${{ steps.get_version.outputs.TAG }}
--- a/.github/workflows/release-pypi.yml
+++ b/.github/workflows/release-pypi.yml
@@ -13,12 +13,14 @@ jobs:
    runs-on: ubuntu-latest
    environment: 'prod'
    steps:
-      - name: Set up python3.8
+      - name: Set up Python
        uses: actions/setup-python@v4
        with:
-          python-version: '3.8'
+          python-version: '3.9'
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Upload to pypi
        run: |
          cd python
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -0,0 +1,23 @@
 # Minimal makefile for Sphinx documentation
 #
 # You can set these variables from the command line, and also
 # from the environment for the first two.
 SPHINXOPTS    ?=
 SPHINXBUILD   ?= sphinx-build
 SOURCEDIR     = .
 BUILDDIR      = _build
 # Put it first so that "make" without argument is like "make help".
 help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 # "clean" is listed as phony as well, so that a stray file or directory
 # named "clean" can never make the target look up to date.
 .PHONY: help Makefile clean
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
 %: Makefile
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 # Explicit rule: takes precedence over the catch-all pattern above and
 # removes everything inside the build directory.
 clean:
 	rm -rf $(BUILDDIR)/*
--- a/docs/README.md
+++ b/docs/README.md
@@ -0,0 +1,33 @@
 # SGLang Documentation
 ## Build the documentation website
 ### Dependency
 ```
 pip install -r requirements.txt
 ```
 ### Build
 ```
 make html
 ```
 ### Clean
 To remove all generated files:
 ```
 make clean
 ```
 ### Serve (preview)
 Run an HTTP server and visit http://localhost:8000 in your browser.
 ```
 python3 -m http.server --directory _build/html
 ```
 ### Deploy
 Clone [sgl-project.github.io](https://github.com/sgl-project/sgl-project.github.io) and make sure you have write access.
 ```bash
 export DOC_SITE_PATH=../../sgl-project.github.io   # update this with your path
 python3 deploy.py
 ```
--- a/docs/en/_static/css/readthedocs.css
+++ b/docs/en/_static/css/readthedocs.css
--- a/docs/en/_static/image/logo.png
+++ b/docs/en/_static/image/logo.png
--- a/docs/en/backend.md
+++ b/docs/en/backend.md
@@ -20,7 +20,7 @@ curl http://localhost:30000/generate \
  }'
 ```
-Learn more about the argument specification, streaming, and multi-modal support [here](https://sglang.readthedocs.io/en/latest/sampling_params.html).
+Learn more about the argument specification, streaming, and multi-modal support [here](https://sgl-project.github.io/sampling_params.html).
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
@@ -74,7 +74,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --mem-fraction-static 0.7
 ```
- See [hyperparameter tuning](https://sglang.readthedocs.io/en/latest/hyperparameter_tuning.html) on tuning hyperparameters for better performance.
+- See [hyperparameter tuning](https://sgl-project.github.io/hyperparameter_tuning.html) on tuning hyperparameters for better performance.
 - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
@@ -83,7 +83,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sglang.readthedocs.io/en/latest/custom_chat_template.html).
+- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sgl-project.github.io/custom_chat_template.html).
 - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
 ```
 # Node 0
@@ -158,7 +158,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - gte-Qwen2
  - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
-Instructions for supporting a new model are [here](https://sglang.readthedocs.io/en/latest/model_support.html).
+Instructions for supporting a new model are [here](https://sgl-project.github.io/model_support.html).
 #### Use Models From ModelScope
 <details>
--- a/docs/en/benchmark_and_profiling.md
+++ b/docs/en/benchmark_and_profiling.md
--- a/docs/en/choices_methods.md
+++ b/docs/en/choices_methods.md
--- a/docs/en/conf.py
+++ b/docs/en/conf.py
@@ -3,7 +3,7 @@ import sys
 sys.path.insert(0, os.path.abspath("../.."))
-version_file = "../../python/sglang/version.py"
+version_file = "../python/sglang/version.py"
 with open(version_file, "r") as f:
    exec(compile(f.read(), version_file, "exec"))
 __version__ = locals()["__version__"]
--- a/docs/en/contributor_guide.md
+++ b/docs/en/contributor_guide.md
@@ -11,4 +11,4 @@ pre-commit run --all-files
 ```
 ## Add Unit Tests
-Add unit tests under [sglang/test](../../test). You can learn how to add and run tests from the README.md in that folder.
+Add unit tests under [sglang/test](https://github.com/sgl-project/sglang/tree/main/test). You can learn how to add and run tests from the README.md in that folder.
--- a/docs/en/custom_chat_template.md
+++ b/docs/en/custom_chat_template.md
@@ -1,6 +1,6 @@
 # Custom Chat Template in SGLang Runtime
-**NOTE**: There are two chat template systems in SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined at [conversation.py](../../python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined at [chat_template.py](../../python/sglang/lang/chat_template.py)).
+**NOTE**: There are two chat template systems in SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined at [conversation.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined at [chat_template.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/lang/chat_template.py)).
 By default, the server uses the chat template specified in the model tokenizer from Hugging Face.
 It should just work for most official models such as Llama-2/Llama-3.
--- a/docs/deploy.py
+++ b/docs/deploy.py
@@ -0,0 +1,22 @@
 #!/usr/bin/python3
 import os
 from datetime import datetime
 def run_cmd(cmd):
    """Echo a shell command line, then execute it with ``os.system``.

    Args:
        cmd: Command line handed to the system shell as-is, so shell syntax
            such as ``;`` and ``$VAR`` is interpreted by the shell.

    Returns:
        None. The exit status reported by ``os.system`` is discarded, so a
        failing command does not stop the caller.
    """
    print(cmd)
    os.system(cmd)
 # Deployment steps. $DOC_SITE_PATH is expanded by the shell (not by Python),
 # so it must be set in the environment before this script is run.
 # Step 1: bring the local checkout of the documentation site up to date.
 run_cmd("cd $DOC_SITE_PATH; git pull")
 # (Optional) Remove old files
 # run_cmd("rm -rf $DOC_SITE_PATH/*")
 # Step 2: copy the freshly built HTML into the site checkout.
 run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
 # Step 3: commit everything with a timestamped message and push to main.
 # run_cmd ignores exit statuses, so this runs even if an earlier step failed.
 cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
 run_cmd(
    f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main"
 )
--- a/docs/deploy_docs.sh
+++ b/docs/deploy_docs.sh
--- a/docs/en/embedding_model.ipynb
+++ b/docs/en/embedding_model.ipynb
--- a/docs/en/Makefile
+++ b/docs/en/Makefile
@@ -1,12 +0,0 @@
 # Makefile that forwards every target to sphinx-build's "make mode" (-M).
 # Sphinx options, the builder executable, and the source/build directories.
 SPHINXOPTS    =
 SPHINXBUILD   = sphinx-build
 SOURCEDIR     = .
 BUILDDIR      = _build
 # First target, so a bare "make" runs sphinx-build's help.
 help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 .PHONY: help Makefile
 # Catch-all: any other target name is passed to sphinx-build as the builder.
 %: Makefile
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/docs/en/frontend.md
+++ b/docs/en/frontend.md
--- a/docs/en/hyperparameter_tuning.md
+++ b/docs/en/hyperparameter_tuning.md
--- a/docs/en/index.rst
+++ b/docs/en/index.rst
@@ -10,24 +10,28 @@ The core features include:
 - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 .. toctree::
   :maxdepth: 1
   :caption: Getting Started
   install.md
-   send_request.ipynb
+
 .. toctree::
   :maxdepth: 1
   :caption: Backend Tutorial
   backend.md
 .. toctree::
   :maxdepth: 1
   :caption: Frontend Tutorial
   frontend.md
 .. toctree::
   :maxdepth: 1
   :caption: References
@@ -39,4 +43,3 @@ The core features include:
   choices_methods.md
   benchmark_and_profiling.md
   troubleshooting.md
   embedding_model.ipynb
--- a/docs/en/install.md
+++ b/docs/en/install.md
@@ -48,9 +48,9 @@ docker run --gpus all \
 <summary>More</summary>
 > This method is recommended if you plan to serve it as a service.
-> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+> A better approach is to use the [k8s-sglang-service.yaml](https://github.com/sgl-project/sglang/blob/main/docker/k8s-sglang-service.yaml).
-1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+1. Copy the [compose.yml](https://github.com/sgl-project/sglang/blob/main/docker/compose.yaml) to your local machine
 2. Execute the command `docker compose up -d` in your terminal.
 </details>
--- a/docs/en/model_support.md
+++ b/docs/en/model_support.md
--- a/docs/en/release_process.md
+++ b/docs/en/release_process.md
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,12 +1,17 @@
 ipykernel
 ipywidgets
 jupyter_client
 markdown>=3.4.0
 matplotlib
 myst-parser
 nbconvert
 nbsphinx
 pandoc
 pillow
 pydantic
 sphinx
 sphinx-book-theme
 sphinx-copybutton
 sphinx-tabs
 sphinxcontrib-mermaid
-pillow
+urllib3<2.0.0
 pydantic
 urllib3<2.0.0
 nbsphinx
 pandoc
--- a/docs/en/sampling_params.md
+++ b/docs/en/sampling_params.md
@@ -194,7 +194,7 @@ Since we compute penalty algorithms through CUDA, the logic stores relevant para
 You can run your own benchmark with desired parameters on your own hardware to make sure it's not OOMing before using.
-Tuning `--mem-fraction-static` and/or `--max-running-requests` will help. See [here](hyperparameter_tuning.md#minor-tune---max-prefill-tokens---mem-fraction-static---max-running-requests) for more information.
+Tuning `--mem-fraction-static` and/or `--max-running-requests` will help.
 ### Benchmarks
--- a/docs/en/send_request.ipynb
+++ b/docs/en/send_request.ipynb
--- a/docs/serve.sh
+++ b/docs/serve.sh
@@ -0,0 +1 @@
 # Serve the built HTML documentation from _build/html with Python's built-in
 # HTTP server. The option is spelled out in full rather than relying on
 # argparse prefix abbreviation ("--d").
 python3 -m http.server --directory _build/html
--- a/docs/en/setup_github_runner.md
+++ b/docs/en/setup_github_runner.md
--- a/docs/en/troubleshooting.md
+++ b/docs/en/troubleshooting.md
@@ -5,9 +5,9 @@ This page lists some common errors and tips for fixing them.
 ## CUDA error: an illegal memory access was encountered
 This error may be due to kernel errors or out-of-memory issues.
 - If it is a kernel error, it is not easy to fix.
- If it is out-of-memory, sometimes it will report this error instead of "Out-of-memory." In this case, try setting a smaller value for `--mem-fraction-static`. The default value of `--mem-fraction-static` is around 0.8 - 0.9. https://github.com/sgl-project/sglang/blob/1edd4e07d6ad52f4f63e7f6beaa5987c1e1cf621/python/sglang/srt/server_args.py#L92-L102
+- If it is out-of-memory, sometimes it will report this error instead of "Out-of-memory." In this case, try setting a smaller value for `--mem-fraction-static`. The default value of `--mem-fraction-static` is around 0.8 - 0.9.
 ## The server hangs
 If the server hangs, try disabling some optimizations when launching the server.
 - Add `--disable-cuda-graph`.
- Add `--disable-flashinfer-sampling`.
+- Add `--sampling-backend pytorch`.
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -0,0 +1,4 @@
 # Install the Python dependencies used by the CI jobs.
 #
 # Abort on the first failing command (and on unset variables or failed
 # pipelines). The workflows run this file as
 # `bash scripts/ci_install_dependency.sh`; without this line only the exit
 # status of the last command would reach the calling step, so an earlier
 # failed install would go unnoticed.
 set -euo pipefail
 pip install --upgrade pip
 pip install -e "python[all]"
 pip install transformers==4.45.2
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
--- a/scripts/killall_sglang.sh
+++ b/scripts/killall_sglang.sh