From 761b2cebd65ff7fbf2cd55b63e1230df1bf6f6ca Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Sun, 1 Sep 2024 02:36:56 -0700
Subject: [PATCH] [CI] merge all ci tests into one file (#1289)

---
 .github/workflows/accuracy-test.yml     |  74 ---------
 .github/workflows/e2e-test.yml          |  96 -----------
 .github/workflows/lint.yml              |  11 +-
 .github/workflows/pr-test.yml           | 201 ++++++++++++++++++++++++
 .github/workflows/unit-test.yml         |  56 -------
 python/sglang/README.md                 |   4 +-
 test/srt/test_moe_serving_throughput.py |   2 +-
 7 files changed, 211 insertions(+), 233 deletions(-)
 delete mode 100644 .github/workflows/accuracy-test.yml
 delete mode 100644 .github/workflows/e2e-test.yml
 create mode 100644 .github/workflows/pr-test.yml
 delete mode 100644 .github/workflows/unit-test.yml

diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml
deleted file mode 100644
index b7118e217..000000000
--- a/.github/workflows/accuracy-test.yml
+++ /dev/null
@@ -1,74 +0,0 @@
-name: Accuracy Test
-
-on:
-  push:
-    branches: [ main ]
-    paths:
-      - "python/sglang/**"
-      - "test/**"
-  pull_request:
-    branches: [ main ]
-    paths:
-      - "python/sglang/**"
-      - "test/**"
-  workflow_dispatch:
-
-concurrency:
-  group: accuracy-test-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  one-gpu:
-    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
-    runs-on: 1-gpu-runner
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
-
-          git clone https://github.com/merrymercy/human-eval.git
-          cd human-eval
-          pip install -e .
-
-      - name: Evaluate Accuracy
-        timeout-minutes: 20
-        run: |
-          cd test/srt
-          python3 test_eval_accuracy_large.py
-
-  two-gpu:
-    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
-    runs-on: 2-gpu-runner
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
-
-          git clone https://github.com/merrymercy/human-eval.git
-          cd human-eval
-          pip install -e .
-
-      - name: Evaluate Accuracy
-        timeout-minutes: 20
-        run: |
-          cd test/srt
-          python3 test_moe_eval_accuracy_large.py
-
-  finish:
-    needs: [one-gpu, two-gpu]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Finish
-        run: echo "This is an empty step to ensure that all jobs are completed."
diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml
deleted file mode 100644
index c5594ac4a..000000000
--- a/.github/workflows/e2e-test.yml
+++ /dev/null
@@ -1,96 +0,0 @@
-name: E2E Test
-
-on:
-  push:
-    branches: [ main ]
-    paths:
-      - "python/sglang/**"
-      - "test/**"
-  pull_request:
-    branches: [ main ]
-    paths:
-      - "python/sglang/**"
-      - "test/**"
-  workflow_dispatch:
-
-concurrency:
-  group: e2e-test-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  one-gpu:
-    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
-    runs-on: 1-gpu-runner
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
-
-      - name: Benchmark Serving Throughput
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
-
-      - name: Benchmark Serving Latency
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_serving_latency.TestServingLatency.test_default
-
-      - name: Benchmark Serving Throughput (w/o RadixAttention)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
-
-      - name: Benchmark Serving Throughput (w/o ChunkedPrefill)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
-
-  two-gpu:
-    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
-    runs-on: 2-gpu-runner
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
-
-      - name: Benchmark Serving Throughput (TP=2)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
-
-      - name: Benchmark Serving Latency (TP=2)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
-
-      - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
-
-  finish:
-    needs: [one-gpu, two-gpu]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Finish
-        run: echo "This is an empty step to ensure that all jobs are completed."
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 076140506..4857f844f 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -1,19 +1,22 @@
 name: Lint
 
-on: [push, pull_request]
+on: [pull_request]
 
 jobs:
   lint:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
-      - name: Set up Python 3.8
+
+      - name: Set up Python 3.9
         uses: actions/setup-python@v2
         with:
-          python-version: 3.8
+          python-version: '3.9'  # quoted so YAML keeps a string (a bare 3.10 would load as 3.1)
+
       - name: Install pre-commit hook
         run: |
           python -m pip install pre-commit
           pre-commit install
+
       - name: Linting
         run: pre-commit run --all-files
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
new file mode 100644
index 000000000..f8b50ad5d
--- /dev/null
+++ b/.github/workflows/pr-test.yml
@@ -0,0 +1,201 @@
+name: Pull Request Test
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'python/sglang/**'
+      - 'test/**'
+  pull_request:
+    branches: [main]
+    paths:
+      - 'python/sglang/**'
+      - 'test/**'
+  workflow_dispatch:  # also allow manually triggered runs
+
+concurrency:
+  group: pr-test-${{ github.ref }}  # one concurrency group per git ref
+  cancel-in-progress: true  # a newer run in the same group cancels the running one
+
+jobs:
+  unit-test-frontend:  # minimal suite of the frontend language tests (test/lang)
+    runs-on: 1-gpu-runner
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[dev]"
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+      - name: Run test
+        timeout-minutes: 20
+        working-directory: test/lang
+        run: |
+          python3 run_suite.py --suite minimal
+
+  unit-test-backend-part-0:  # backend (test/srt) minimal suite, first range slice
+    runs-on: 1-gpu-runner
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[dev]"
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+      - name: Run test
+        timeout-minutes: 20
+        working-directory: test/srt
+        run: |
+          python3 run_suite.py --suite minimal --range-begin 0 --range-end 8
+
+  unit-test-backend-part-1:  # backend (test/srt) minimal suite, slice starting at index 8
+    runs-on: 1-gpu-runner
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[dev]"
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+      - name: Run test
+        timeout-minutes: 20
+        working-directory: test/srt
+        run: |
+          python3 run_suite.py --suite minimal --range-begin 8
+
+  performance-test-1-gpu:  # serving throughput/latency benchmarks, one unittest case per step
+    runs-on: 1-gpu-runner
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[all]"
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+      - name: Benchmark Serving Throughput
+        timeout-minutes: 10
+        working-directory: test/srt
+        run: |
+          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
+
+      - name: Benchmark Serving Latency
+        timeout-minutes: 10
+        working-directory: test/srt
+        run: |
+          python3 -m unittest test_serving_latency.TestServingLatency.test_default
+
+      - name: Benchmark Serving Throughput (w/o RadixAttention)
+        timeout-minutes: 10
+        working-directory: test/srt
+        run: |
+          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+
+      - name: Benchmark Serving Throughput (w/o ChunkedPrefill)
+        timeout-minutes: 10
+        working-directory: test/srt
+        run: |
+          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
+
+  performance-test-2-gpu:  # same benchmark layout for the test_moe_* cases on the 2-gpu-runner label
+    runs-on: 2-gpu-runner
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[all]"
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+      - name: Benchmark Serving Throughput (TP=2)
+        timeout-minutes: 10
+        working-directory: test/srt
+        run: |
+          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
+
+      - name: Benchmark Serving Latency (TP=2)
+        timeout-minutes: 10
+        working-directory: test/srt
+        run: |
+          python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
+
+      - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
+        timeout-minutes: 10
+        working-directory: test/srt
+        run: |
+          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+
+  accuracy-test-1-gpu:  # installs the human-eval checkout, then runs test_eval_accuracy_large.py
+    runs-on: 1-gpu-runner
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[all]"
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+          git clone https://github.com/merrymercy/human-eval.git
+          cd human-eval
+          pip install -e .
+
+      - name: Evaluate Accuracy
+        timeout-minutes: 20
+        working-directory: test/srt
+        run: |
+          python3 test_eval_accuracy_large.py
+
+  accuracy-test-2-gpu:  # installs the human-eval checkout, then runs test_moe_eval_accuracy_large.py
+    runs-on: 2-gpu-runner
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[all]"
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+          git clone https://github.com/merrymercy/human-eval.git
+          cd human-eval
+          pip install -e .
+
+      - name: Evaluate Accuracy
+        timeout-minutes: 20
+        working-directory: test/srt
+        run: |
+          python3 test_moe_eval_accuracy_large.py
+
+  finish:  # gate job: must turn red when a needed job fails instead of being skipped
+    if: ${{ !cancelled() && (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
+    needs: [unit-test-frontend, unit-test-backend-part-0, unit-test-backend-part-1, performance-test-1-gpu,
+      performance-test-2-gpu, accuracy-test-1-gpu, accuracy-test-2-gpu]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Finish
+        # With only `needs`, a failed dependency skips this job, and a skipped check is reported as success.
+        if: contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled')
+        run: echo "At least one required job failed or was cancelled." && exit 1
diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
deleted file mode 100644
index 5d774b67e..000000000
--- a/.github/workflows/unit-test.yml
+++ /dev/null
@@ -1,56 +0,0 @@
-name: Unit Test
-
-on:
-  push:
-    branches: [ main ]
-    paths:
-      - "python/sglang/**"
-      - "test/**"
-  pull_request:
-    branches: [ main ]
-    paths:
-      - "python/sglang/**"
-      - "test/**"
-  workflow_dispatch:
-
-concurrency:
-  group: unit-test-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  run-test:
-    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
-    runs-on: 1-gpu-runner
-    strategy:
-      matrix:
-        test_type: ['backend-0', 'backend-1', 'frontend']
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install -e "python[dev]"
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
-
-      - name: Run test
-        timeout-minutes: 20
-        run: |
-          if [ "${{ matrix.test_type }}" = "frontend" ]; then
-            cd test/lang
-            python3 run_suite.py --suite minimal
-          elif [ "${{ matrix.test_type }}" = "backend-0" ]; then
-            cd test/srt
-            python3 run_suite.py --suite minimal --range-begin 0 --range-end 8
-          elif [ "${{ matrix.test_type }}" = "backend-1" ]; then
-            cd test/srt
-            python3 run_suite.py --suite minimal --range-begin 8
-          fi
-
-  finish:
-    needs: [run-test]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Finish
-        run: echo "This is an empty step to ensure that all jobs are completed."
diff --git a/python/sglang/README.md b/python/sglang/README.md
index c92144254..481c69aff 100644
--- a/python/sglang/README.md
+++ b/python/sglang/README.md
@@ -2,8 +2,8 @@
 
 - `lang`: The frontend language.
 - `srt`: The backend engine for running local models. (SRT = SGLang Runtime).
-- `test`: Test utilities.
-- `api.py`: Public API.
+- `test`: The test utilities.
+- `api.py`: The public APIs.
 - `bench_latency.py`: Benchmark a single static batch.
 - `bench_serving.py`: Benchmark online serving with dynamic requests.
 - `global_config.py`: The global configs and constants.
diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py
index 6f040da34..2acf626c1 100644
--- a/test/srt/test_moe_serving_throughput.py
+++ b/test/srt/test_moe_serving_throughput.py
@@ -75,7 +75,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 1850
+            assert res["output_throughput"] > 1800
 
     def test_default_without_radix_cache(self):
         res = self.run_test(