[CI] Add more multi-gpu tests (#1280)

2024-09-01 00:27:25 -07:00
parent d134c139a1
commit 1b5d56f7f8
11 changed files with 271 additions and 128 deletions
--- a/.github/workflows/accuracy-test.yml
+++ b/.github/workflows/accuracy-test.yml
@@ -18,7 +18,7 @@ concurrency:
  cancel-in-progress: true
 jobs:
-  accuracy-test:
+  one-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
@@ -41,3 +41,34 @@ jobs:
        run: |
          cd test/srt
          python3 test_eval_accuracy_large.py
  # Accuracy job for the 2-GPU runner: installs the project plus evaluation
  # dependencies, then runs the MoE accuracy script from test/srt.
  two-gpu:
    runs-on: 2-gpu-runner
    # Same gate as the one-gpu job: upstream repository, or any pull_request event.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    steps:
      - uses: actions/checkout@v3
        name: Checkout code

      # Installs the package with all extras, a flashinfer wheel from the
      # cu121/torch2.4 index, and the human-eval fork in editable mode
      # (presumably required by the humaneval evaluation; confirm).
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
          pip install -e .

      # The step is cancelled if the evaluation runs longer than 20 minutes.
      - name: Evaluate Accuracy
        run: |
          cd test/srt
          python3 test_moe_eval_accuracy_large.py
        timeout-minutes: 20
  # Fan-in job: declared to need both GPU jobs and does nothing except echo a
  # message.
  finish:
    runs-on: ubuntu-latest
    needs:
      - one-gpu
      - two-gpu
    steps:
      - name: Finish
        run: >-
          echo "This is an empty step to ensure that all jobs are completed."
--- a/.github/workflows/cache-purge.yml
+++ b/.github/workflows/cache-purge.yml
@@ -1,27 +0,0 @@
 # Purges the pip cache on the self-hosted runner and reinstalls dependencies.
 name: Weekly Cache Purge

 on:
  # Can also be started manually.
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * 0"  # Every Sunday at 00:00

 jobs:
  purge-cache:
    runs-on: self-hosted
    # Only runs in the upstream repository.
    if: github.repository == 'sgl-project/sglang'
    steps:
      - uses: actions/checkout@v3
        name: Checkout code

      # Activates the venv under $HOME/venv, appends its bin directory to
      # GITHUB_PATH, then clears pip's cache.
      - name: Purge pip cache
        run: |
          source $HOME/venv/bin/activate
          echo "$HOME/venv/bin" >> $GITHUB_PATH
          pip cache purge

      # Reinstalls the project (all extras) and the flashinfer wheel.
      - name: Update dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
--- a/.github/workflows/e2e-test.yml
+++ b/.github/workflows/e2e-test.yml
@@ -18,7 +18,7 @@ concurrency:
  cancel-in-progress: true
 jobs:
-  e2e-test:
+  one-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
@@ -41,7 +41,8 @@ jobs:
      - name: Benchmark Serving Latency
        timeout-minutes: 10
        run: |
-          python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8
+          cd test/srt
          python3 -m unittest test_serving_latency.TestServingLatency.test_default
      - name: Benchmark Serving Throughput (w/o RadixAttention)
        timeout-minutes: 10
@@ -54,3 +55,42 @@ jobs:
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
  # Benchmark job for the 2-GPU runner: three unittest-driven benchmarks from
  # test/srt, each limited to 10 minutes.
  two-gpu:
    runs-on: 2-gpu-runner
    # Same gate as the one-gpu job: upstream repository, or any pull_request event.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    steps:
      - uses: actions/checkout@v3
        name: Checkout code

      # Installs the package with all extras and a flashinfer wheel from the
      # cu121/torch2.4 index.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

      - name: Benchmark Serving Throughput (TP=2)
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
        timeout-minutes: 10

      - name: Benchmark Serving Latency (TP=2)
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
        timeout-minutes: 10

      - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
        timeout-minutes: 10
  # Fan-in job: declared to need both GPU jobs and does nothing except echo a
  # message.
  finish:
    runs-on: ubuntu-latest
    needs:
      - one-gpu
      - two-gpu
    steps:
      - name: Finish
        run: >-
          echo "This is an empty step to ensure that all jobs are completed."
--- a/.github/workflows/moe-test.yml
+++ b/.github/workflows/moe-test.yml
@@ -1,45 +0,0 @@
 # Runs the MoE serving-throughput benchmarks on the 2-GPU runner.
 name: MoE Test

 # Triggered by pushes and pull requests targeting main that touch the Python
 # package or the tests, and by manual dispatch.
 on:
  push:
    branches:
      - main
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches:
      - main
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

 # One run per ref; a newer run cancels the one still in progress.
 concurrency:
  group: moe-test-${{ github.ref }}
  cancel-in-progress: true

 jobs:
  moe-test:
    runs-on: 2-gpu-runner
    # Upstream repository, or any pull_request event.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    steps:
      - uses: actions/checkout@v3
        name: Checkout code

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

      - name: Benchmark MoE Serving Throughput
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
        timeout-minutes: 10

      - name: Benchmark MoE Serving Throughput (w/o RadixAttention)
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
        timeout-minutes: 10
--- a/.github/workflows/unit-test.yml
+++ b/.github/workflows/unit-test.yml
@@ -18,7 +18,7 @@ concurrency:
  cancel-in-progress: true
 jobs:
-  unit-test-jobs:
+  run-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    strategy:
@@ -48,9 +48,9 @@ jobs:
            python3 run_suite.py --suite minimal --range-begin 8
          fi
-  unit-test:
+  finish:
-    needs: unit-test-jobs
+    needs: [run-test]
    runs-on: ubuntu-latest
    steps:
-      - name: Merge step
+      - name: Finish
-        run: echo "This is an empty merge step"
+        run: echo "This is an empty step to ensure that all jobs are completed."
--- a/python/sglang/bench_latency.py
+++ b/python/sglang/bench_latency.py
@@ -11,26 +11,34 @@ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ## plot the results in series of lines:
 python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
 # Usage (correctness test):
 python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 ## Reference output (of the correctness test above, can be gpu dependent):
-prefill logits (first half) tensor([[-10.0312,  -9.5000,   0.8936,  ...,  -4.9414,  -3.2402,  -3.3633],
+input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
-        [-10.0312,  -9.5000,   0.8936,  ...,  -4.9414,  -3.2402,  -3.3633],
+
-        [ -9.1875, -10.2500,   2.7109,  ...,  -4.3359,  -4.0664,  -4.1328]],
+prefill logits (first half): tensor([[-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
-       device='cuda:0', dtype=torch.float16)
+        [-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
-prefill logits (final) tensor([[-8.3203, -7.1211,  3.3379,  ..., -4.9570, -4.1328, -3.4141],
+        [ -9.1875, -10.2500,   2.7129,  ...,  -4.3359,  -4.0664,  -4.1328]],
-        [-8.9062, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0742],
+       device='cuda:0')
-        [-9.6328, -9.0547,  4.0117,  ..., -5.3047, -4.7148, -4.4609]],
+
-       device='cuda:0', dtype=torch.float16)
+prefill logits (final): tensor([[-8.3125, -7.1172,  3.3457,  ..., -4.9570, -4.1328, -3.4141],
-<s> The capital of France is.
+        [-8.9141, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0781],
        [-9.6328, -9.0547,  4.0195,  ..., -5.3047, -4.7148, -4.4570]],
       device='cuda:0')
 ========== Prompt 0 ==========
 <s> The capital of France is Paris.
 The capital of the United States is Washington, D.C.
-<s> The capital of the United Kindom is.
+
 ========== Prompt 1 ==========
 <s> The capital of the United Kindom is London.
 The capital of the United Kingdom is London.
 The capital of the
-<s> Today is a sunny day and I like go for a walk in the park.
+
 ========== Prompt 2 ==========
 <s> Today is a sunny day and I like to go for a walk in the park.
 I'm going to the park
 """
@@ -225,12 +233,12 @@ def correctness_test(
    # Prepare inputs
    input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
-    rank_print(f"{input_ids=}")
+    rank_print(f"\n{input_ids=}\n")
    if bench_args.cut_len > 0:
        # Prefill
        next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-        rank_print("prefill logits (first half)", next_token_logits)
+        rank_print(f"prefill logits (first half): {next_token_logits} \n")
    # Prepare extend inputs
    reqs = prepare_extend_inputs_for_correctness_test(
@@ -239,7 +247,7 @@ def correctness_test(
    # Extend
    next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-    rank_print("prefill logits (final)", next_token_logits)
+    rank_print(f"prefill logits (final): {next_token_logits} \n")
    # Decode
    output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
@@ -250,7 +258,8 @@ def correctness_test(
    # Print
    for i in range(len(reqs)):
-        rank_print(tokenizer.decode(output_ids[i]))
+        rank_print(f"========== Prompt {i} ==========")
        rank_print(tokenizer.decode(output_ids[i]), "\n")
@torch.inference_mode()
--- a/test/srt/test_moe_eval_accuracy_large.py
+++ b/test/srt/test_moe_eval_accuracy_large.py
@@ -0,0 +1,73 @@
 import unittest
 from types import SimpleNamespace
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
 )
 class TestEvalAccuracyLarge(unittest.TestCase):
    """Accuracy evaluations run against one shared server (MoE test model, ``--tp 2``)."""

    @classmethod
    def setUpClass(cls):
        """Launch the server once; every test in the class reuses it."""
        cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        launch_flags = ["--log-level-http", "warning", "--tp", "2"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the process tree of the server started in ``setUpClass``."""
        kill_child_process(cls.process.pid)

    def _evaluate(self, eval_name, num_examples):
        """Run ``run_eval`` for one benchmark and return the metrics it reports.

        Args:
            eval_name: Name of the evaluation passed through to ``run_eval``.
            num_examples: Number of examples, or ``None`` as used by the
                humaneval and mgsm_en tests.
        """
        request = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name=eval_name,
            num_examples=num_examples,
            num_threads=1024,
        )
        return run_eval(request)

    def test_mmlu(self):
        metrics = self._evaluate("mmlu", 3000)
        assert metrics["score"] >= 0.63, f"{metrics}"

    def test_human_eval(self):
        metrics = self._evaluate("humaneval", None)
        assert metrics["score"] >= 0.43, f"{metrics}"

    def test_mgsm_en(self):
        metrics = self._evaluate("mgsm_en", None)
        assert metrics["score"] >= 0.64, f"{metrics}"
 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/test_moe_serving_latency.py
+++ b/test/srt/test_moe_serving_latency.py
@@ -0,0 +1,45 @@
 import os
 import subprocess
 import unittest
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST
 class TestServingLatency(unittest.TestCase):
    """Runs ``sglang.bench_latency`` on the MoE test model with ``--tp 2``."""

    def test_default(self):
        """Run the benchmark in a subprocess and check the number it reports.

        The number is the second-to-last space-separated token on the third
        line from the end of stdout. It is compared against a lower bound
        (presumably a throughput figure, since larger is treated as better;
        confirm against bench_latency's output) only when
        ``SGLANG_IS_IN_CI`` is ``"true"``.
        """
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
            "--tp",
            "2",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        try:
            stdout, stderr = process.communicate()
            output = stdout.decode()
            error = stderr.decode()
            print(f"Output: {output}")
            print(f"Error: {error}")

            lines = output.split("\n")
            # A crashed benchmark prints little or nothing; fail with a clear
            # message instead of an IndexError from lines[-3].
            self.assertGreaterEqual(
                len(lines), 3, "bench_latency produced fewer output lines than expected"
            )
            lastline = lines[-3]
            try:
                value = float(lastline.split(" ")[-2])
            except (IndexError, ValueError):
                self.fail(f"Could not parse a number from benchmark line: {lastline!r}")

            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
                # unittest assertion: not stripped under ``python -O`` and
                # reports the measured value on failure.
                self.assertGreater(value, 125)
        finally:
            # Clean up even when parsing or the threshold check fails.
            kill_child_process(process.pid)
 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/test_moe_serving_throughput.py
+++ b/test/srt/test_moe_serving_throughput.py
@@ -23,7 +23,6 @@ class TestServingThroughput(unittest.TestCase):
            other_args.append("--disable-flashinfer")
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
        other_args.extend(["--tensor-parallel-size", "2"])
        other_args.append("--enable-p2p-check")
        model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        base_url = DEFAULT_URL_FOR_TEST
@@ -35,7 +34,7 @@ class TestServingThroughput(unittest.TestCase):
        )
        # Run benchmark
-        num_prompts = 200
+        num_prompts = 300
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
@@ -76,8 +75,7 @@ class TestServingThroughput(unittest.TestCase):
        )
        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1800
+            assert res["output_throughput"] > 1850
            assert res["output_throughput"] > 1750
    def test_default_without_radix_cache(self):
        res = self.run_test(
@@ -87,18 +85,7 @@ class TestServingThroughput(unittest.TestCase):
        )
        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1900
+            assert res["output_throughput"] > 1950
            assert res["output_throughput"] > 1850
    def test_all_cases(self):
        for disable_radix_cache in [False, True]:
            for disable_flashinfer in [False, True]:
                for chunked_prefill_size in [-1, 2048]:
                    self.run_test(
                        disable_radix_cache=False,
                        disable_flashinfer=False,
                        chunked_prefill_size=-1,
                    )
 if __name__ == "__main__":
--- a/test/srt/test_serving_latency.py
+++ b/test/srt/test_serving_latency.py
@@ -0,0 +1,43 @@
 import os
 import subprocess
 import unittest
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
 class TestServingLatency(unittest.TestCase):
    """Runs ``sglang.bench_latency`` on the default test model."""

    def test_default(self):
        """Run the benchmark in a subprocess and check the number it reports.

        The number is the second-to-last space-separated token on the third
        line from the end of stdout. It is compared against a lower bound
        (presumably a throughput figure, since larger is treated as better;
        confirm against bench_latency's output) only when
        ``SGLANG_IS_IN_CI`` is ``"true"``.
        """
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        try:
            stdout, stderr = process.communicate()
            output = stdout.decode()
            error = stderr.decode()
            print(f"Output: {output}")
            print(f"Error: {error}")

            lines = output.split("\n")
            # A crashed benchmark prints little or nothing; fail with a clear
            # message instead of an IndexError from lines[-3].
            self.assertGreaterEqual(
                len(lines), 3, "bench_latency produced fewer output lines than expected"
            )
            lastline = lines[-3]
            try:
                value = float(lastline.split(" ")[-2])
            except (IndexError, ValueError):
                self.fail(f"Could not parse a number from benchmark line: {lastline!r}")

            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
                # unittest assertion: not stripped under ``python -O`` and
                # reports the measured value on failure.
                self.assertGreater(value, 130)
        finally:
            # Clean up even when parsing or the threshold check fails.
            kill_child_process(process.pid)
 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/test_serving_throughput.py
+++ b/test/srt/test_serving_throughput.py
@@ -33,7 +33,7 @@ class TestServingThroughput(unittest.TestCase):
        )
        # Run benchmark
-        num_prompts = 400
+        num_prompts = 500
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
@@ -74,8 +74,7 @@ class TestServingThroughput(unittest.TestCase):
        )
        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
+            assert res["output_throughput"] > 2400
            assert res["output_throughput"] > 2500
    def test_default_without_radix_cache(self):
        res = self.run_test(
@@ -85,7 +84,6 @@ class TestServingThroughput(unittest.TestCase):
        )
        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE): 1500, H100 (SMX): 2850
            assert res["output_throughput"] > 2800
    def test_default_without_chunked_prefill(self):
@@ -96,18 +94,7 @@ class TestServingThroughput(unittest.TestCase):
        )
        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
+            assert res["output_throughput"] > 2400
            assert res["output_throughput"] > 2500
    def test_all_cases(self):
        for disable_radix_cache in [False, True]:
            for disable_flashinfer in [False, True]:
                for chunked_prefill_size in [-1, 2048]:
                    self.run_test(
                        disable_radix_cache=False,
                        disable_flashinfer=False,
                        chunked_prefill_size=-1,
                    )
 if __name__ == "__main__":