From 1b5d56f7f885cdc4284579ee863f9944f4c12bce Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Sun, 1 Sep 2024 00:27:25 -0700
Subject: [PATCH] [CI] Add more multi-gpu tests (#1280)

---
 .github/workflows/accuracy-test.yml      | 33 ++++++++++-
 .github/workflows/cache-purge.yml        | 27 ---------
 .github/workflows/e2e-test.yml           | 44 +++++++++++++-
 .github/workflows/moe-test.yml           | 45 ---------------
 .github/workflows/unit-test.yml          | 10 ++--
 python/sglang/bench_latency.py           | 41 +++++++------
 test/srt/test_moe_eval_accuracy_large.py | 73 ++++++++++++++++++++++++
 test/srt/test_moe_serving_latency.py     | 45 +++++++++++++++
 test/srt/test_moe_serving_throughput.py  | 19 +-----
 test/srt/test_serving_latency.py         | 43 ++++++++++++++
 test/srt/test_serving_throughput.py      | 19 +-----
 11 files changed, 271 insertions(+), 128 deletions(-)
 delete mode 100644 .github/workflows/cache-purge.yml
 delete mode 100644 .github/workflows/moe-test.yml
 create mode 100644 test/srt/test_moe_eval_accuracy_large.py
 create mode 100644 test/srt/test_moe_serving_latency.py
 create mode 100644 test/srt/test_serving_latency.py

diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml
index 6fb102a4c..b7118e217 100644
--- a/.github/workflows/accuracy-test.yml
+++ b/.github/workflows/accuracy-test.yml
@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  accuracy-test:
+  one-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
 
@@ -41,3 +41,34 @@ jobs:
         run: |
           cd test/srt
           python3 test_eval_accuracy_large.py
+
+  two-gpu:  # accuracy evals for the MoE test model, scheduled on the 2-gpu-runner label
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'  # upstream repo, or any pull_request event
+    runs-on: 2-gpu-runner
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies  # sglang (editable install), flashinfer wheel, then the human-eval fork
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[all]"
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+          git clone https://github.com/merrymercy/human-eval.git
+          cd human-eval
+          pip install -e .
+
+      - name: Evaluate Accuracy  # runs the test file as a script, i.e. through its unittest.main() entry point
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 test_moe_eval_accuracy_large.py
+
+  finish:  # no-op job whose only purpose is to depend on every GPU job in this workflow
+    needs: [one-gpu, two-gpu]  # job is not started until both listed jobs have succeeded
+    runs-on: ubuntu-latest
+    steps:
+      - name: Finish
+        run: echo "This is an empty step to ensure that all jobs are completed."
diff --git a/.github/workflows/cache-purge.yml b/.github/workflows/cache-purge.yml
deleted file mode 100644
index c699f4988..000000000
--- a/.github/workflows/cache-purge.yml
+++ /dev/null
@@ -1,27 +0,0 @@
-name: Weekly Cache Purge
-
-on:
-  schedule:
-    - cron: '0 0 * * 0' # Every Sunday at 00:00
-  workflow_dispatch:
-
-jobs:
-  purge-cache:
-    if: github.repository == 'sgl-project/sglang'
-    runs-on: self-hosted
-
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v3
-
-    - name: Purge pip cache
-      run: |
-        source $HOME/venv/bin/activate
-        echo "$HOME/venv/bin" >> $GITHUB_PATH
-        pip cache purge
-
-    - name: Update dependencies
-      run: |
-        pip install --upgrade pip
-        pip install -e "python[all]"
-        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml
index 11c94775c..c5594ac4a 100644
--- a/.github/workflows/e2e-test.yml
+++ b/.github/workflows/e2e-test.yml
@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  e2e-test:
+  one-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
 
@@ -41,7 +41,8 @@ jobs:
       - name: Benchmark Serving Latency
         timeout-minutes: 10
         run: |
-          python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8
+          cd test/srt
+          python3 -m unittest test_serving_latency.TestServingLatency.test_default
 
       - name: Benchmark Serving Throughput (w/o RadixAttention)
         timeout-minutes: 10
@@ -54,3 +55,42 @@ jobs:
         run: |
           cd test/srt
           python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
+
+  two-gpu:  # benchmarks from the test_moe_* files, scheduled on the 2-gpu-runner label
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'  # upstream repo, or any pull_request event
+    runs-on: 2-gpu-runner
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies  # sglang (editable install) plus the flashinfer wheel
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[all]"
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+      - name: Benchmark Serving Throughput (TP=2)
+        timeout-minutes: 10  # every benchmark below is its own step with its own 10-minute limit
+        run: |
+          cd test/srt
+          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
+
+      - name: Benchmark Serving Latency (TP=2)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
+
+      - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+
+  finish:  # no-op job whose only purpose is to depend on every GPU job in this workflow
+    needs: [one-gpu, two-gpu]  # job is not started until both listed jobs have succeeded
+    runs-on: ubuntu-latest
+    steps:
+      - name: Finish
+        run: echo "This is an empty step to ensure that all jobs are completed."
diff --git a/.github/workflows/moe-test.yml b/.github/workflows/moe-test.yml
deleted file mode 100644
index 4440aa215..000000000
--- a/.github/workflows/moe-test.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-name: MoE Test
-
-on:
-  push:
-    branches: [ main ]
-    paths:
-      - "python/sglang/**"
-      - "test/**"
-  pull_request:
-    branches: [ main ]
-    paths:
-      - "python/sglang/**"
-      - "test/**"
-  workflow_dispatch:
-
-concurrency:
-  group: moe-test-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  moe-test:
-    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
-    runs-on: 2-gpu-runner
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install -e "python[all]"
-          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
-
-      - name: Benchmark MoE Serving Throughput
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
-
-      - name: Benchmark MoE Serving Throughput (w/o RadixAttention)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
index 41a565a63..5d774b67e 100644
--- a/.github/workflows/unit-test.yml
+++ b/.github/workflows/unit-test.yml
@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  unit-test-jobs:
+  run-test:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
     strategy:
@@ -48,9 +48,9 @@ jobs:
             python3 run_suite.py --suite minimal --range-begin 8
           fi
 
-  unit-test:
-    needs: unit-test-jobs
+  finish:  # no-op job that depends on the run-test matrix job
+    needs: [run-test]  # job is not started until run-test has succeeded
     runs-on: ubuntu-latest
     steps:
-      - name: Merge step
-        run: echo "This is an empty merge step"
\ No newline at end of file
+      - name: Finish
+        run: echo "This is an empty step to ensure that all jobs are completed."
diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py
index 966a97d20..9006b7150 100644
--- a/python/sglang/bench_latency.py
+++ b/python/sglang/bench_latency.py
@@ -11,26 +11,34 @@ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ## plot the results in series of lines:
 python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
 
-
 # Usage (correctness test):
 python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
 ## Reference output (of the correctness test above, can be gpu dependent):
-prefill logits (first half) tensor([[-10.0312,  -9.5000,   0.8936,  ...,  -4.9414,  -3.2402,  -3.3633],
-        [-10.0312,  -9.5000,   0.8936,  ...,  -4.9414,  -3.2402,  -3.3633],
-        [ -9.1875, -10.2500,   2.7109,  ...,  -4.3359,  -4.0664,  -4.1328]],
-       device='cuda:0', dtype=torch.float16)
-prefill logits (final) tensor([[-8.3203, -7.1211,  3.3379,  ..., -4.9570, -4.1328, -3.4141],
-        [-8.9062, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0742],
-        [-9.6328, -9.0547,  4.0117,  ..., -5.3047, -4.7148, -4.4609]],
-       device='cuda:0', dtype=torch.float16)
-<s> The capital of France is.
+input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
+
+prefill logits (first half): tensor([[-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
+        [-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
+        [ -9.1875, -10.2500,   2.7129,  ...,  -4.3359,  -4.0664,  -4.1328]],
+       device='cuda:0')
+
+prefill logits (final): tensor([[-8.3125, -7.1172,  3.3457,  ..., -4.9570, -4.1328, -3.4141],
+        [-8.9141, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0781],
+        [-9.6328, -9.0547,  4.0195,  ..., -5.3047, -4.7148, -4.4570]],
+       device='cuda:0')
+
+========== Prompt 0 ==========
+<s> The capital of France is Paris.
 The capital of the United States is Washington, D.C.
 
-<s> The capital of the United Kindom is.
+
+========== Prompt 1 ==========
+<s> The capital of the United Kindom is London.
 The capital of the United Kingdom is London.
 The capital of the
-<s> Today is a sunny day and I like go for a walk in the park.
+
+========== Prompt 2 ==========
+<s> Today is a sunny day and I like to go for a walk in the park.
 I'm going to the park
 """
 
@@ -225,12 +233,12 @@ def correctness_test(
 
     # Prepare inputs
     input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
-    rank_print(f"{input_ids=}")
+    rank_print(f"\n{input_ids=}\n")
 
     if bench_args.cut_len > 0:
         # Prefill
         next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-        rank_print("prefill logits (first half)", next_token_logits)
+        rank_print(f"prefill logits (first half): {next_token_logits} \n")
 
     # Prepare extend inputs
     reqs = prepare_extend_inputs_for_correctness_test(
@@ -239,7 +247,7 @@ def correctness_test(
 
     # Extend
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-    rank_print("prefill logits (final)", next_token_logits)
+    rank_print(f"prefill logits (final): {next_token_logits} \n")
 
     # Decode
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
@@ -250,7 +258,8 @@ def correctness_test(
 
     # Print
     for i in range(len(reqs)):
-        rank_print(tokenizer.decode(output_ids[i]))
+        rank_print(f"========== Prompt {i} ==========")
+        rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
 @torch.inference_mode()
diff --git a/test/srt/test_moe_eval_accuracy_large.py b/test/srt/test_moe_eval_accuracy_large.py
new file mode 100644
index 000000000..d13f427d8
--- /dev/null
+++ b/test/srt/test_moe_eval_accuracy_large.py
@@ -0,0 +1,73 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestEvalAccuracyLarge(unittest.TestCase):
+    """Accuracy thresholds for the default MoE test model served with ``--tp 2``.
+
+    A single server process is launched for the class and reused by every eval.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Record the model/URL on the class and start the server process."""
+        cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        # Extra server flags: HTTP log level "warning" and ``--tp 2``.
+        launch_flags = ["--log-level-http", "warning", "--tp", "2"]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=launch_flags,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Hand the launched server's pid to ``kill_child_process``."""
+        kill_child_process(cls.process.pid)
+
+    def _evaluate(self, eval_name, num_examples):
+        """Run ``eval_name`` through ``run_eval`` and return the resulting metrics.
+
+        ``num_examples`` is forwarded as given (the callers pass 3000 or ``None``).
+        """
+        eval_args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name=eval_name,
+            num_examples=num_examples,
+            num_threads=1024,
+        )
+        return run_eval(eval_args)
+
+    # Each test below fails with the full metrics as the assertion message when
+    # the reported "score" falls under its hard-coded floor.
+
+    def test_mmlu(self):
+        # eval_name "mmlu" with num_examples=3000.
+        metrics = self._evaluate("mmlu", 3000)
+        assert metrics["score"] >= 0.63, f"{metrics}"
+
+    def test_human_eval(self):
+        # eval_name "humaneval" with num_examples=None.
+        metrics = self._evaluate("humaneval", None)
+        assert metrics["score"] >= 0.43, f"{metrics}"
+
+    def test_mgsm_en(self):
+        # eval_name "mgsm_en" with num_examples=None.
+        metrics = self._evaluate("mgsm_en", None)
+        assert metrics["score"] >= 0.64, f"{metrics}"
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_moe_serving_latency.py b/test/srt/test_moe_serving_latency.py
new file mode 100644
index 000000000..9d5215323
--- /dev/null
+++ b/test/srt/test_moe_serving_latency.py
@@ -0,0 +1,45 @@
+import os
+import subprocess
+import unittest
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST
+
+
+class TestServingLatency(unittest.TestCase):
+    """Runs ``sglang.bench_latency`` on the MoE test model with ``--tp 2``."""
+
+    def test_default(self):
+        """Run the benchmark once; in CI, require the parsed figure to exceed 125.
+
+        The threshold applies only when ``SGLANG_IS_IN_CI`` is ``"true"``.
+        """
+        command = ["python3", "-m", "sglang.bench_latency"]
+        command += ["--model", DEFAULT_MOE_MODEL_NAME_FOR_TEST, "--batch-size", "1"]
+        command += ["--input", "128", "--output", "8", "--tp", "2"]
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        try:
+            stdout, stderr = process.communicate()
+            output = stdout.decode()
+            error = stderr.decode()
+            print(f"Output: {output}")
+            print(f"Error: {error}")
+
+            # Figure = second-to-last space-separated token on the third line
+            # from the end of stdout. NOTE(review): presumably a throughput,
+            # since the CI check is a lower bound -- confirm.
+            lastline = output.split("\n")[-3]
+            value = float(lastline.split(" ")[-2])
+
+            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+                assert value > 125, f"parsed value: {value}"
+        finally:
+            # Clean up even when output parsing or the threshold check raises;
+            # previously a failing run skipped this call entirely.
+            kill_child_process(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py
index 4f6e8db82..6f040da34 100644
--- a/test/srt/test_moe_serving_throughput.py
+++ b/test/srt/test_moe_serving_throughput.py
@@ -23,7 +23,6 @@ class TestServingThroughput(unittest.TestCase):
             other_args.append("--disable-flashinfer")
         other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
         other_args.extend(["--tensor-parallel-size", "2"])
-        other_args.append("--enable-p2p-check")
 
         model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
         base_url = DEFAULT_URL_FOR_TEST
@@ -35,7 +34,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         # Run benchmark
-        num_prompts = 200
+        num_prompts = 300
         args = SimpleNamespace(
             backend="sglang",
             base_url=base_url,
@@ -76,8 +75,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1800
-            assert res["output_throughput"] > 1750
+            assert res["output_throughput"] > 1850
 
     def test_default_without_radix_cache(self):
         res = self.run_test(
@@ -87,18 +85,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1900
-            assert res["output_throughput"] > 1850
-
-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
+            assert res["output_throughput"] > 1950
 
 
 if __name__ == "__main__":
diff --git a/test/srt/test_serving_latency.py b/test/srt/test_serving_latency.py
new file mode 100644
index 000000000..e762892c8
--- /dev/null
+++ b/test/srt/test_serving_latency.py
@@ -0,0 +1,43 @@
+import os
+import subprocess
+import unittest
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
+
+
+class TestServingLatency(unittest.TestCase):
+    """Runs ``sglang.bench_latency`` on the default test model."""
+
+    def test_default(self):
+        """Run the benchmark once; in CI, require the parsed figure to exceed 130."""
+        command = ["python3", "-m", "sglang.bench_latency"]
+        command += ["--model", DEFAULT_MODEL_NAME_FOR_TEST, "--batch-size", "1"]
+        command += ["--input", "128", "--output", "8"]
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        try:
+            stdout, stderr = process.communicate()
+            output = stdout.decode()
+            error = stderr.decode()
+            print(f"Output: {output}")
+            print(f"Error: {error}")
+
+            # Figure = second-to-last space-separated token on the third line
+            # from the end of stdout. NOTE(review): presumably a throughput,
+            # since the CI check is a lower bound -- confirm.
+            lastline = output.split("\n")[-3]
+            value = float(lastline.split(" ")[-2])
+
+            # The threshold is enforced only when SGLANG_IS_IN_CI is "true".
+            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+                assert value > 130, f"parsed value: {value}"
+        finally:
+            # Clean up even when output parsing or the threshold check raises;
+            # previously a failing run skipped this call entirely.
+            kill_child_process(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py
index f1089a6a7..d4ed12612 100644
--- a/test/srt/test_serving_throughput.py
+++ b/test/srt/test_serving_throughput.py
@@ -33,7 +33,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         # Run benchmark
-        num_prompts = 400
+        num_prompts = 500
         args = SimpleNamespace(
             backend="sglang",
             base_url=base_url,
@@ -74,8 +74,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2500
+            assert res["output_throughput"] > 2400
 
     def test_default_without_radix_cache(self):
         res = self.run_test(
@@ -85,7 +84,6 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1500, H100 (SMX): 2850
             assert res["output_throughput"] > 2800
 
     def test_default_without_chunked_prefill(self):
@@ -96,18 +94,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2500
-
-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
+            assert res["output_throughput"] > 2400
 
 
 if __name__ == "__main__":