[CI] merge all ci tests into one file (#1289)

2024-09-01 02:36:56 -07:00
parent 54772f784a
commit 761b2cebd6
7 changed files with 211 additions and 233 deletions
--- a/.github/workflows/accuracy-test.yml
+++ b/.github/workflows/accuracy-test.yml
@@ -1,74 +0,0 @@
 # Workflow that runs the large accuracy evaluation scripts from test/srt.
 name: Accuracy Test
 # Triggers: pushes and pull requests against main that touch the listed
 # paths, plus manual runs through workflow_dispatch.
 on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:
 # One concurrency group per ref; a newer run cancels the one in progress.
 concurrency:
  group: accuracy-test-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  # Accuracy evaluation on the runner labelled 1-gpu-runner.
  one-gpu:
    # Run inside sgl-project/sglang, or for any pull_request event.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      # Editable install with the "all" extra, flashinfer force-reinstalled
      # from the cu121/torch2.4 index, then human-eval installed from a clone.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
          pip install -e .
      # Executes test_eval_accuracy_large.py directly; step is capped at 20 minutes.
      - name: Evaluate Accuracy
        timeout-minutes: 20
        run: |
          cd test/srt
          python3 test_eval_accuracy_large.py
  # Same setup on the runner labelled 2-gpu-runner, running the MoE script.
  two-gpu:
    # Run inside sgl-project/sglang, or for any pull_request event.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      # Identical dependency installation to the one-gpu job.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
          pip install -e .
      # Executes test_moe_eval_accuracy_large.py; step is capped at 20 minutes.
      - name: Evaluate Accuracy
        timeout-minutes: 20
        run: |
          cd test/srt
          python3 test_moe_eval_accuracy_large.py
  # Placeholder job gated on both GPU jobs through `needs`; it only echoes.
  finish:
    needs: [one-gpu, two-gpu]
    runs-on: ubuntu-latest
    steps:
      - name: Finish
        run: echo "This is an empty step to ensure that all jobs are completed."
--- a/.github/workflows/e2e-test.yml
+++ b/.github/workflows/e2e-test.yml
@@ -1,96 +0,0 @@
 # Workflow that runs the serving throughput and latency unittest cases.
 name: E2E Test
 # Triggers: pushes and pull requests against main that touch the listed
 # paths, plus manual runs through workflow_dispatch.
 on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:
 # One concurrency group per ref; a newer run cancels the one in progress.
 concurrency:
  group: e2e-test-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  # Benchmarks on the runner labelled 1-gpu-runner.
  one-gpu:
    # Run inside sgl-project/sglang, or for any pull_request event.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      # Editable install with the "all" extra, then flashinfer force-reinstalled
      # from the cu121/torch2.4 index.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      # Each benchmark step below runs one unittest case from test/srt and is
      # capped at 10 minutes.
      - name: Benchmark Serving Throughput
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
      - name: Benchmark Serving Latency
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_serving_latency.TestServingLatency.test_default
      - name: Benchmark Serving Throughput (w/o RadixAttention)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
      - name: Benchmark Serving Throughput (w/o ChunkedPrefill)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
  # Benchmarks on the runner labelled 2-gpu-runner, using the test_moe_* modules.
  two-gpu:
    # Run inside sgl-project/sglang, or for any pull_request event.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      # Identical dependency installation to the one-gpu job.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      # Each benchmark step below runs one unittest case and is capped at 10 minutes.
      - name: Benchmark Serving Throughput (TP=2)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
      - name: Benchmark Serving Latency (TP=2)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
      - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
  # Placeholder job gated on both GPU jobs through `needs`; it only echoes.
  finish:
    needs: [one-gpu, two-gpu]
    runs-on: ubuntu-latest
    steps:
      - name: Finish
        run: echo "This is an empty step to ensure that all jobs are completed."
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -1,19 +1,22 @@
 # Lint workflow as changed by this commit; lines prefixed with - / + are the
 # removed / added sides of the diff. It installs pre-commit and runs it
 # against all files.
 name: Lint
-on: [push, pull_request]
+on: [pull_request]
 jobs:
  lint:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
-      - name: Set up Python 3.8
+
      - name: Set up Python 3.9
        uses: actions/setup-python@v2
        with:
-          python-version: 3.8
+          python-version: 3.9
      # Install pre-commit with pip and run `pre-commit install` in the checkout.
      - name: Install pre-commit hook
        run: |
          python -m pip install pre-commit
          pre-commit install
      # --all-files checks the whole repository rather than only changed files.
      - name: Linting
        run: pre-commit run --all-files
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -0,0 +1,201 @@
 # Single workflow holding the unit-test, performance-test and accuracy-test
 # jobs, followed by a `finish` job that depends on all of them.
 name: Pull Request Test
 # Triggers: pushes and pull requests against main that touch the listed
 # paths, plus manual runs through workflow_dispatch.
 on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:
 # One concurrency group per ref; a newer run cancels the one in progress.
 concurrency:
  group: pr-test-${{ github.ref }}
  cancel-in-progress: true
 # Every test job below carries the same `if`: run inside sgl-project/sglang,
 # or for any pull_request event.
 jobs:
  # Frontend suite: run_suite.py --suite minimal from test/lang.
  unit-test-frontend:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      # Editable install with the "dev" extra, then flashinfer force-reinstalled
      # from the cu121/torch2.4 index.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[dev]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      # Step is capped at 20 minutes.
      - name: Run test
        timeout-minutes: 20
        run: |
          cd test/lang
          python3 run_suite.py --suite minimal
  # Backend suite, first part: test/srt run_suite.py with --range-begin 0
  # --range-end 8. NOTE(review): range semantics live in run_suite.py, which
  # is not visible here; presumably indices into the suite's test list.
  unit-test-backend-part-0:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      # Same "dev" install as unit-test-frontend.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[dev]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      # Step is capped at 20 minutes.
      - name: Run test
        timeout-minutes: 20
        run: |
          cd test/srt
          python3 run_suite.py --suite minimal --range-begin 0 --range-end 8
  # Backend suite, second part: same command with only --range-begin 8.
  unit-test-backend-part-1:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      # Same "dev" install as unit-test-frontend.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[dev]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      # Step is capped at 20 minutes.
      - name: Run test
        timeout-minutes: 20
        run: |
          cd test/srt
          python3 run_suite.py --suite minimal --range-begin 8
  # Serving benchmarks on the runner labelled 1-gpu-runner.
  performance-test-1-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      # Editable install with the "all" extra, then flashinfer force-reinstalled
      # from the cu121/torch2.4 index.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      # Each benchmark step below runs one unittest case from test/srt and is
      # capped at 10 minutes.
      - name: Benchmark Serving Throughput
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
      - name: Benchmark Serving Latency
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_serving_latency.TestServingLatency.test_default
      - name: Benchmark Serving Throughput (w/o RadixAttention)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
      - name: Benchmark Serving Throughput (w/o ChunkedPrefill)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
  # Serving benchmarks on the runner labelled 2-gpu-runner, using the
  # test_moe_* modules.
  performance-test-2-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      # Same "all" install as performance-test-1-gpu.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      # Each benchmark step below runs one unittest case and is capped at 10 minutes.
      - name: Benchmark Serving Throughput (TP=2)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
      - name: Benchmark Serving Latency (TP=2)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
      - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
  # Accuracy evaluation on the runner labelled 1-gpu-runner.
  accuracy-test-1-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      # "all" install plus flashinfer, then human-eval installed from a clone.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
          pip install -e .
      # Executes test_eval_accuracy_large.py directly; step is capped at 20 minutes.
      - name: Evaluate Accuracy
        timeout-minutes: 20
        run: |
          cd test/srt
          python3 test_eval_accuracy_large.py
  # Accuracy evaluation on the runner labelled 2-gpu-runner, running the MoE script.
  accuracy-test-2-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      # Same installation as accuracy-test-1-gpu.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
          pip install -e .
      # Executes test_moe_eval_accuracy_large.py; step is capped at 20 minutes.
      - name: Evaluate Accuracy
        timeout-minutes: 20
        run: |
          cd test/srt
          python3 test_moe_eval_accuracy_large.py
  # Placeholder job gated on all seven test jobs through `needs`; it only echoes.
  finish:
    needs: [
      unit-test-frontend, unit-test-backend-part-0, unit-test-backend-part-1,
      performance-test-1-gpu, performance-test-2-gpu,
      accuracy-test-1-gpu, accuracy-test-2-gpu
    ]
    runs-on: ubuntu-latest
    steps:
      - name: Finish
        run: echo "This is an empty step to ensure that all jobs are completed."
--- a/.github/workflows/unit-test.yml
+++ b/.github/workflows/unit-test.yml
@@ -1,56 +0,0 @@
 # Workflow that runs the `minimal` test suites through a three-way matrix.
 name: Unit Test
 # Triggers: pushes and pull requests against main that touch the listed
 # paths, plus manual runs through workflow_dispatch.
 on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:
 # One concurrency group per ref; a newer run cancels the one in progress.
 concurrency:
  group: unit-test-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  # One job per matrix.test_type value, each on the runner labelled 1-gpu-runner.
  run-test:
    # Run inside sgl-project/sglang, or for any pull_request event.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    strategy:
      matrix:
        test_type: ['backend-0', 'backend-1', 'frontend']
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      # Editable install with the "dev" extra, then flashinfer force-reinstalled
      # from the cu121/torch2.4 index.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[dev]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      # The script branches on matrix.test_type: "frontend" runs the suite in
      # test/lang; "backend-0" and "backend-1" run it in test/srt with
      # different --range-begin/--range-end arguments. Capped at 20 minutes.
      - name: Run test
        timeout-minutes: 20
        run: |
          if [ "${{ matrix.test_type }}" = "frontend" ]; then
            cd test/lang
            python3 run_suite.py --suite minimal
          elif [ "${{ matrix.test_type }}" = "backend-0" ]; then
            cd test/srt
            python3 run_suite.py --suite minimal --range-begin 0 --range-end 8
          elif [ "${{ matrix.test_type }}" = "backend-1" ]; then
            cd test/srt
            python3 run_suite.py --suite minimal --range-begin 8
          fi
  # Placeholder job gated on run-test through `needs`; it only echoes.
  finish:
    needs: [run-test]
    runs-on: ubuntu-latest
    steps:
      - name: Finish
        run: echo "This is an empty step to ensure that all jobs are completed."
--- a/python/sglang/README.md
+++ b/python/sglang/README.md
@@ -2,8 +2,8 @@
 - `lang`: The frontend language.
 - `srt`: The backend engine for running local models. (SRT = SGLang Runtime).
- `test`: Test utilities.
+- `test`: The test utilities.
- `api.py`: Public API.
+- `api.py`: The public APIs.
 - `bench_latency.py`: Benchmark a single static batch.
 - `bench_serving.py`: Benchmark online serving with dynamic requests.
 - `global_config.py`: The global configs and constants.
--- a/test/srt/test_moe_serving_throughput.py
+++ b/test/srt/test_moe_serving_throughput.py
@@ -75,7 +75,7 @@ class TestServingThroughput(unittest.TestCase):
        )
        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 1850
+            assert res["output_throughput"] > 1800
    def test_default_without_radix_cache(self):
        res = self.run_test(