[CI] Add more multi-gpu tests (#1280)

.github/workflows/accuracy-test.yml
@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  accuracy-test:
+  one-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
 
@@ -41,3 +41,34 @@ jobs:
       run: |
         cd test/srt
         python3 test_eval_accuracy_large.py
+
+  two-gpu:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 2-gpu-runner
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip
+        pip install -e "python[all]"
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+        git clone https://github.com/merrymercy/human-eval.git
+        cd human-eval
+        pip install -e .
+
+    - name: Evaluate Accuracy
+      timeout-minutes: 20
+      run: |
+        cd test/srt
+        python3 test_moe_eval_accuracy_large.py
+
+  finish:
+    needs: [one-gpu, two-gpu]
+    runs-on: ubuntu-latest
+    steps:
+    - name: Finish
+      run: echo "This is an empty step to ensure that all jobs are completed."

.github/workflows/cache-purge.yml (deleted)
@@ -1,27 +0,0 @@
-name: Weekly Cache Purge
-
-on:
-  schedule:
-    - cron: '0 0 * * 0' # Every Sunday at 00:00
-  workflow_dispatch:
-
-jobs:
-  purge-cache:
-    if: github.repository == 'sgl-project/sglang'
-    runs-on: self-hosted
-
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v3
-
-    - name: Purge pip cache
-      run: |
-        source $HOME/venv/bin/activate
-        echo "$HOME/venv/bin" >> $GITHUB_PATH
-        pip cache purge
-
-    - name: Update dependencies
-      run: |
-        pip install --upgrade pip
-        pip install -e "python[all]"
-        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

.github/workflows/e2e-test.yml
@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  e2e-test:
+  one-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
 
@@ -41,7 +41,8 @@ jobs:
     - name: Benchmark Serving Latency
      timeout-minutes: 10
       run: |
-        python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8
+        cd test/srt
+        python3 -m unittest test_serving_latency.TestServingLatency.test_default
 
     - name: Benchmark Serving Throughput (w/o RadixAttention)
       timeout-minutes: 10
@@ -54,3 +55,42 @@ jobs:
       run: |
         cd test/srt
         python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
+
+  two-gpu:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 2-gpu-runner
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip
+        pip install -e "python[all]"
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+    - name: Benchmark Serving Throughput (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
+
+    - name: Benchmark Serving Latency (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
+
+    - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+
+  finish:
+    needs: [one-gpu, two-gpu]
+    runs-on: ubuntu-latest
+    steps:
+    - name: Finish
+      run: echo "This is an empty step to ensure that all jobs are completed."

.github/workflows/moe-test.yml (deleted)
@@ -1,45 +0,0 @@
-name: MoE Test
-
-on:
-  push:
-    branches: [ main ]
-    paths:
-      - "python/sglang/**"
-      - "test/**"
-  pull_request:
-    branches: [ main ]
-    paths:
-      - "python/sglang/**"
-      - "test/**"
-  workflow_dispatch:
-
-concurrency:
-  group: moe-test-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  moe-test:
-    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
-    runs-on: 2-gpu-runner
-
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v3
-
-    - name: Install dependencies
-      run: |
-        pip install --upgrade pip
-        pip install -e "python[all]"
-        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
-
-    - name: Benchmark MoE Serving Throughput
-      timeout-minutes: 10
-      run: |
-        cd test/srt
-        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
-
-    - name: Benchmark MoE Serving Throughput (w/o RadixAttention)
-      timeout-minutes: 10
-      run: |
-        cd test/srt
-        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache

.github/workflows/unit-test.yml
@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  unit-test-jobs:
+  run-test:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
     strategy:
@@ -48,9 +48,9 @@ jobs:
           python3 run_suite.py --suite minimal --range-begin 8
         fi
 
-  unit-test:
-    needs: unit-test-jobs
+  finish:
+    needs: [run-test]
     runs-on: ubuntu-latest
     steps:
-    - name: Merge step
-      run: echo "This is an empty merge step"
+    - name: Finish
+      run: echo "This is an empty step to ensure that all jobs are completed."
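
A note on the `--range-begin 8` flag in the hunk above: run_suite.py shards one suite across parallel matrix jobs by index range. A minimal sketch of that sharding idea, assuming a flat list of test files (the real run_suite.py may differ):

    # Hedged sketch: index-range sharding of a test suite across CI jobs.
    # The SUITE contents and flag handling are illustrative, not sglang's code.
    import argparse
    import subprocess

    SUITE = [f"test_{i}.py" for i in range(16)]  # placeholder file names

    parser = argparse.ArgumentParser()
    parser.add_argument("--range-begin", type=int, default=0)
    parser.add_argument("--range-end", type=int, default=None)
    args = parser.parse_args()

    # Each CI job passes a different range, e.g. --range-begin 8 here.
    for test_file in SUITE[args.range_begin : args.range_end]:
        subprocess.run(["python3", test_file], check=True)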

python/sglang/bench_latency.py
@@ -11,26 +11,34 @@ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ## plot the results in series of lines:
 python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
 
 
 # Usage (correctness test):
 python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
 ## Reference output (of the correctness test above, can be gpu dependent):
-prefill logits (first half) tensor([[-10.0312,  -9.5000,   0.8936,  ...,  -4.9414,  -3.2402,  -3.3633],
-        [-10.0312,  -9.5000,   0.8936,  ...,  -4.9414,  -3.2402,  -3.3633],
-        [ -9.1875, -10.2500,   2.7109,  ...,  -4.3359,  -4.0664,  -4.1328]],
-       device='cuda:0', dtype=torch.float16)
-prefill logits (final) tensor([[-8.3203, -7.1211,  3.3379,  ..., -4.9570, -4.1328, -3.4141],
-        [-8.9062, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0742],
-        [-9.6328, -9.0547,  4.0117,  ..., -5.3047, -4.7148, -4.4609]],
-       device='cuda:0', dtype=torch.float16)
-<s> The capital of France is.
+input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
+
+prefill logits (first half): tensor([[-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
+        [-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
+        [ -9.1875, -10.2500,   2.7129,  ...,  -4.3359,  -4.0664,  -4.1328]],
+       device='cuda:0')
+
+prefill logits (final): tensor([[-8.3125, -7.1172,  3.3457,  ..., -4.9570, -4.1328, -3.4141],
+        [-8.9141, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0781],
+        [-9.6328, -9.0547,  4.0195,  ..., -5.3047, -4.7148, -4.4570]],
+       device='cuda:0')
+
+========== Prompt 0 ==========
+<s> The capital of France is Paris.
+The capital of the United States is Washington, D.C.
+
-<s> The capital of the United Kindom is.
+
+========== Prompt 1 ==========
+<s> The capital of the United Kindom is London.
+The capital of the United Kingdom is London.
+The capital of the
-<s> Today is a sunny day and I like go for a walk in the park.
+
+========== Prompt 2 ==========
+<s> Today is a sunny day and I like to go for a walk in the park.
+I'm going to the park
 """
@@ -225,12 +233,12 @@ def correctness_test(
 
     # Prepare inputs
     input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
-    rank_print(f"{input_ids=}")
+    rank_print(f"\n{input_ids=}\n")
 
     if bench_args.cut_len > 0:
         # Prefill
         next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-        rank_print("prefill logits (first half)", next_token_logits)
+        rank_print(f"prefill logits (first half): {next_token_logits} \n")
 
         # Prepare extend inputs
         reqs = prepare_extend_inputs_for_correctness_test(
@@ -239,7 +247,7 @@ def correctness_test(
 
         # Extend
         next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-        rank_print("prefill logits (final)", next_token_logits)
+        rank_print(f"prefill logits (final): {next_token_logits} \n")
 
     # Decode
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
@@ -250,7 +258,8 @@ def correctness_test(
 
     # Print
     for i in range(len(reqs)):
-        rank_print(tokenizer.decode(output_ids[i]))
+        rank_print(f"========== Prompt {i} ==========")
+        rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
 @torch.inference_mode()

test/srt/test_moe_eval_accuracy_large.py (new file)
@@ -0,0 +1,73 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestEvalAccuracyLarge(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--log-level-http",
+                "warning",
+                "--tp",
+                "2",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_child_process(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=3000,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        assert metrics["score"] >= 0.63, f"{metrics}"
+
+    def test_human_eval(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="humaneval",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        assert metrics["score"] >= 0.43, f"{metrics}"
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        assert metrics["score"] >= 0.64, f"{metrics}"
+
+
+if __name__ == "__main__":
+    unittest.main()
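
The new test above drives run_eval against a server launched with --tp 2 and asserts score floors per eval. For local debugging, the same entry point can be pointed at an already-running server; a hedged sketch (the base_url, model name, and sample counts below are placeholders, not values from this commit):

    # Hedged sketch: calling run_eval against a server you launched yourself.
    from types import SimpleNamespace

    from sglang.test.run_eval import run_eval

    args = SimpleNamespace(
        base_url="http://127.0.0.1:30000",  # assumed local server address
        model="my-moe-model",               # hypothetical model name
        eval_name="mmlu",                   # one of the eval names the test uses
        num_examples=64,                    # small sample for a quick check
        num_threads=16,
    )
    metrics = run_eval(args)
    print(metrics["score"])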

test/srt/test_moe_serving_latency.py (new file)
@@ -0,0 +1,45 @@
+import os
+import subprocess
+import unittest
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST
+
+
+class TestServingLatency(unittest.TestCase):
+    def test_default(self):
+        command = [
+            "python3",
+            "-m",
+            "sglang.bench_latency",
+            "--model",
+            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            "--batch-size",
+            "1",
+            "--input",
+            "128",
+            "--output",
+            "8",
+            "--tp",
+            "2",
+        ]
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        stdout, stderr = process.communicate()
+        output = stdout.decode()
+        error = stderr.decode()
+        print(f"Output: {output}")
+        print(f"Error: {error}")
+
+        lastline = output.split("\n")[-3]
+        value = float(lastline.split(" ")[-2])
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            assert value > 125
+
+        kill_child_process(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
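
The parsing in the test above (`output.split("\n")[-3]`, then `split(" ")[-2]`) assumes bench_latency ends its stdout with a fixed summary block whose third-to-last line carries the throughput as the second-to-last whitespace-separated field. A hedged sketch of that convention with a fabricated sample line:

    # Hedged sketch: extracting the throughput figure the way the test does.
    # The sample output is illustrative, not captured from a real run.
    sample_stdout = (
        "Benchmark ...\n"
        "Decode.  median latency: 0.0061 s, median throughput:  163.00 token/s\n"
        "Total. latency:  1.245 s, throughput:  130.21 token/s\n"
        "\n"
    )
    lastline = sample_stdout.split("\n")[-3]  # third-to-last line of stdout
    value = float(lastline.split(" ")[-2])    # number before the trailing unit
    assert value == 130.21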

test/srt/test_moe_serving_throughput.py
@@ -23,7 +23,6 @@ class TestServingThroughput(unittest.TestCase):
             other_args.append("--disable-flashinfer")
         other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
         other_args.extend(["--tensor-parallel-size", "2"])
-        other_args.append("--enable-p2p-check")
 
         model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
         base_url = DEFAULT_URL_FOR_TEST
@@ -35,7 +34,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         # Run benchmark
-        num_prompts = 200
+        num_prompts = 300
         args = SimpleNamespace(
             backend="sglang",
             base_url=base_url,
@@ -76,8 +75,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1800
-            assert res["output_throughput"] > 1750
+            assert res["output_throughput"] > 1850
 
     def test_default_without_radix_cache(self):
         res = self.run_test(
@@ -87,18 +85,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1900
-            assert res["output_throughput"] > 1850
-
-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
+            assert res["output_throughput"] > 1950
 
 
 if __name__ == "__main__":
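
The throughput floors in these tests are gated on the SGLANG_IS_IN_CI environment variable, so they only bind on the known CI runners; on arbitrary local GPUs the benchmark still runs but cannot fail on speed. A minimal sketch of the pattern (helper name and numbers are illustrative):

    # Hedged sketch: enforce a perf floor only on CI hardware.
    import os

    def check_throughput(measured: float, threshold: float) -> None:
        # Mirrors the tests' gating: assert only when SGLANG_IS_IN_CI is set.
        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert measured > threshold, f"{measured=} <= {threshold=}"

    check_throughput(measured=2000.0, threshold=1850.0)  # illustrative values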

test/srt/test_serving_latency.py (new file)
@@ -0,0 +1,43 @@
+import os
+import subprocess
+import unittest
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
+
+
+class TestServingLatency(unittest.TestCase):
+    def test_default(self):
+        command = [
+            "python3",
+            "-m",
+            "sglang.bench_latency",
+            "--model",
+            DEFAULT_MODEL_NAME_FOR_TEST,
+            "--batch-size",
+            "1",
+            "--input",
+            "128",
+            "--output",
+            "8",
+        ]
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        stdout, stderr = process.communicate()
+        output = stdout.decode()
+        error = stderr.decode()
+        print(f"Output: {output}")
+        print(f"Error: {error}")
+
+        lastline = output.split("\n")[-3]
+        value = float(lastline.split(" ")[-2])
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            assert value > 130
+
+        kill_child_process(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()

test/srt/test_serving_throughput.py
@@ -33,7 +33,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         # Run benchmark
-        num_prompts = 400
+        num_prompts = 500
         args = SimpleNamespace(
             backend="sglang",
             base_url=base_url,
@@ -74,8 +74,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2500
+            assert res["output_throughput"] > 2400
 
     def test_default_without_radix_cache(self):
         res = self.run_test(
@@ -85,7 +84,6 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1500, H100 (SMX): 2850
             assert res["output_throughput"] > 2800
 
     def test_default_without_chunked_prefill(self):
@@ -96,18 +94,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2500
-
-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
+            assert res["output_throughput"] > 2400
 
 
 if __name__ == "__main__":