diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 6c536afc8..001cdd3f7 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -88,12 +88,37 @@ jobs:
         pip install -e "python[all]"
         pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
 
+      - name: Benchmark Single Latency
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_latency.TestBenchLatency.test_default
+
+      - name: Benchmark Online Latency
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
+
       - name: Benchmark Offline Throughput
         timeout-minutes: 10
         run: |
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
 
+  performance-test-1-gpu-part-2:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[all]"
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
       - name: Benchmark Offline Throughput (w/o RadixAttention)
         timeout-minutes: 10
         run: |
@@ -112,31 +137,6 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
 
-  performance-test-1-gpu-part-2:
-    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
-    runs-on: 1-gpu-runner
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
-
-      - name: Benchmark Single Latency
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_latency.TestBenchLatency.test_default
-
-      - name: Benchmark Online Latency
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
-
   performance-test-2-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 2-gpu-runner
diff --git a/python/sglang/README.md b/python/sglang/README.md
index 481c69aff..78f469c74 100644
--- a/python/sglang/README.md
+++ b/python/sglang/README.md
@@ -7,5 +7,5 @@
 - `bench_latency.py`: Benchmark a single static batch.
 - `bench_serving.py`: Benchmark online serving with dynamic requests.
 - `global_config.py`: The global configs and constants.
-- `launch_server.py`: The entry point of launching local server.
+- `launch_server.py`: The entry point for launching the local server.
 - `utils.py`: Common utilities.
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index d2275e5a2..a196b7676 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -69,7 +69,7 @@ class TestBenchServing(unittest.TestCase):
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
             assert res["median_e2e_latency_ms"] < 12000
-            assert res["median_ttft_ms"] < 78
+            assert res["median_ttft_ms"] < 80
             assert res["median_itl_ms"] < 12
 
     def test_moe_offline_throughput_default(self):