From f7fb68d2925201ce234e97d81ad3095e4dc48cbb Mon Sep 17 00:00:00 2001
From: Yineng Zhang <me@zhyncs.com>
Date: Tue, 13 Aug 2024 16:43:23 +0800
Subject: [PATCH] ci: add moe test (#1053)

---
 .github/workflows/moe-test.yml                |  42 +++++++
 python/sglang/test/test_utils.py              |   6 +-
 test/srt/test_chunked_prefill.py              |   5 +-
 test/srt/test_embedding_openai_server.py      |   5 +-
 test/srt/test_eval_accuracy_large.py          |  10 +-
 ...est_eval_accuracy_large_chunked_prefill.py |  10 +-
 test/srt/test_eval_accuracy_mini.py           |   5 +-
 test/srt/test_large_max_new_tokens.py         |   5 +-
 test/srt/test_moe_serving_throughput.py       | 112 ++++++++++++++++++
 test/srt/test_openai_server.py                |   5 +-
 test/srt/test_serving_throughput.py           |   9 +-
 test/srt/test_skip_tokenizer_init.py          |   5 +-
 test/srt/test_srt_endpoint.py                 |   5 +-
 test/srt/test_torch_compile.py                |   5 +-
 test/srt/test_triton_attn_backend.py          |   5 +-
 test/srt/test_vision_openai_server.py         |   5 +-
 16 files changed, 195 insertions(+), 44 deletions(-)
 create mode 100644 .github/workflows/moe-test.yml
 create mode 100644 test/srt/test_moe_serving_throughput.py

diff --git a/.github/workflows/moe-test.yml b/.github/workflows/moe-test.yml
new file mode 100644
index 000000000..a781f2eff
--- /dev/null
+++ b/.github/workflows/moe-test.yml
@@ -0,0 +1,42 @@
+name: MoE Test
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "python/sglang/**"
+      - "test/**"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "python/sglang/**"
+      - "test/**"
+  workflow_dispatch:
+
+concurrency:
+  group: moe-test-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+    moe-test:
+        if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+        runs-on: accuracy
+    
+        steps:
+            - name: Checkout code
+              uses: actions/checkout@v3
+        
+            - name: Install dependencies
+              run: |
+                source $HOME/venv/bin/activate
+                echo "$HOME/venv/bin" >> $GITHUB_PATH
+        
+                pip install --upgrade pip
+                pip install -e "python[all]"
+                pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+            - name: Benchmark MOE Serving Throughput
+              run: |
+                cd test/srt
+                python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
+                python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 7243ff2ec..66f3e4f35 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -21,7 +21,11 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import get_exception_traceback
 
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-DEFAULT_URL_FOR_TEST = "http://127.0.0.1:8157"
+DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157"
+DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157"
+DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157"
+DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157"
 
 
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py
index 5b2bb4aaa..94c424762 100644
--- a/test/srt/test_chunked_prefill.py
+++ b/test/srt/test_chunked_prefill.py
@@ -5,20 +5,19 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )
 
 
 class TestChunkedPrefill(unittest.TestCase):
-
     def run_mmlu(self, disable_radix_cache):
         other_args = ["--chunked-prefill-size", "32"]
         if disable_radix_cache:
             other_args += ["--disable-radix-cache"]
 
         model = DEFAULT_MODEL_NAME_FOR_TEST
-        base_url = DEFAULT_URL_FOR_TEST
+        base_url = DEFAULT_URL_FOR_UNIT_TEST
         process = popen_launch_server(
             model,
             base_url,
diff --git a/test/srt/test_embedding_openai_server.py b/test/srt/test_embedding_openai_server.py
index 45580feda..fd8fec48e 100644
--- a/test/srt/test_embedding_openai_server.py
+++ b/test/srt/test_embedding_openai_server.py
@@ -4,15 +4,14 @@ import openai
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server
 
 
 class TestOpenAIServer(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         cls.model = "intfloat/e5-mistral-7b-instruct"
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, api_key=cls.api_key
diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py
index 556954331..9f99b0b95 100644
--- a/test/srt/test_eval_accuracy_large.py
+++ b/test/srt/test_eval_accuracy_large.py
@@ -5,17 +5,17 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_ACCURACY_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )
 
 
 class TestEvalAccuracyLarge(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:7157"
+        cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST
         cls.process = popen_launch_server(
             cls.model,
             cls.base_url,
@@ -49,7 +49,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.65, f"{metrics}"
+        assert metrics["score"] >= 0.64, f"{metrics}"
 
     def test_mgsm_en(self):
         args = SimpleNamespace(
@@ -61,7 +61,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.85, f"{metrics}"
+        assert metrics["score"] >= 0.84, f"{metrics}"
 
 
 if __name__ == "__main__":
diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py
index 297fc22e1..040a2db75 100644
--- a/test/srt/test_eval_accuracy_large_chunked_prefill.py
+++ b/test/srt/test_eval_accuracy_large_chunked_prefill.py
@@ -5,17 +5,17 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_ACCURACY_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )
 
 
 class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:7157"
+        cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST
         cls.process = popen_launch_server(
             cls.model,
             cls.base_url,
@@ -49,7 +49,7 @@ class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.65, f"{metrics}"
+        assert metrics["score"] >= 0.64, f"{metrics}"
 
     def test_mgsm_en(self):
         args = SimpleNamespace(
@@ -61,7 +61,7 @@ class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.85, f"{metrics}"
+        assert metrics["score"] >= 0.84, f"{metrics}"
 
 
 if __name__ == "__main__":
diff --git a/test/srt/test_eval_accuracy_mini.py b/test/srt/test_eval_accuracy_mini.py
index b5533da37..a4219b1a0 100644
--- a/test/srt/test_eval_accuracy_mini.py
+++ b/test/srt/test_eval_accuracy_mini.py
@@ -5,17 +5,16 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )
 
 
 class TestEvalAccuracyMini(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
 
     @classmethod
diff --git a/test/srt/test_large_max_new_tokens.py b/test/srt/test_large_max_new_tokens.py
index 58f82b351..f29adabce 100644
--- a/test/srt/test_large_max_new_tokens.py
+++ b/test/srt/test_large_max_new_tokens.py
@@ -10,17 +10,16 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )
 
 
 class TestOpenAIServer(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model,
diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py
new file mode 100644
index 000000000..6353e5099
--- /dev/null
+++ b/test/srt/test_moe_serving_throughput.py
@@ -0,0 +1,112 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+from sglang.bench_serving import run_benchmark
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import (
+    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_MOE_TEST,
+    popen_launch_server,
+)
+
+
+class TestServingThroughput(unittest.TestCase):
+    def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size):
+        # Launch the server
+        other_args = []
+        if disable_radix_cache:
+            other_args.append("--disable-radix-cache")
+        if disable_flashinfer:
+            other_args.append("--disable-flashinfer")
+        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
+        other_args.extend(["--tensor-parallel-size", "2"])
+        other_args.append("--enable-p2p-check")
+
+        model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
+        base_url = DEFAULT_URL_FOR_MOE_TEST
+        process = popen_launch_server(
+            model, base_url, timeout=300, other_args=other_args
+        )
+
+        # Run benchmark
+        num_prompts = 400
+        args = SimpleNamespace(
+            backend="sglang",
+            base_url=base_url,
+            host=None,
+            port=None,
+            dataset_name="random",
+            dataset_path="",
+            model=None,
+            tokenizer=None,
+            num_prompts=num_prompts,
+            sharegpt_output_len=None,
+            random_input_len=4096,
+            random_output_len=2048,
+            random_range_ratio=0.0,
+            request_rate=float("inf"),
+            multi=None,
+            seed=0,
+            output_file=None,
+            disable_tqdm=False,
+            disable_stream=False,
+            disable_ignore_eos=False,
+            extra_request_body=None,
+        )
+
+        try:
+            res = run_benchmark(args)
+        finally:
+            kill_child_process(process.pid)
+
+        assert res["completed"] == num_prompts
+        return res
+
+    def test_default(self):
+        res = self.run_test(
+            disable_radix_cache=ServerArgs.disable_radix_cache,
+            disable_flashinfer=ServerArgs.disable_flashinfer,
+            chunked_prefill_size=ServerArgs.chunked_prefill_size,
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            # A100 (PCIE) performance
+            assert res["output_throughput"] > 950
+
+    def test_default_without_radix_cache(self):
+        res = self.run_test(
+            disable_radix_cache=True,
+            disable_flashinfer=ServerArgs.disable_flashinfer,
+            chunked_prefill_size=ServerArgs.chunked_prefill_size,
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            # A100 (PCIE) performance
+            assert res["output_throughput"] > 950
+
+    def test_default_with_chunked_prefill(self):
+        res = self.run_test(
+            disable_radix_cache=ServerArgs.disable_radix_cache,
+            disable_flashinfer=ServerArgs.disable_flashinfer,
+            chunked_prefill_size=8192,
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            # A100 (PCIE) performance
+            print(res["output_throughput"])
+
+    def test_all_cases(self):
+        for disable_radix_cache in [False, True]:
+            for disable_flashinfer in [False, True]:
+                for chunked_prefill_size in [-1, 2048]:
+                    self.run_test(
+                        disable_radix_cache=False,
+                        disable_flashinfer=False,
+                        chunked_prefill_size=-1,
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py
index b66c35f01..872424756 100644
--- a/test/srt/test_openai_server.py
+++ b/test/srt/test_openai_server.py
@@ -8,17 +8,16 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )
 
 
 class TestOpenAIServer(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, api_key=cls.api_key
diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py
index c733163f5..c99d2e07e 100644
--- a/test/srt/test_serving_throughput.py
+++ b/test/srt/test_serving_throughput.py
@@ -5,11 +5,14 @@ from types import SimpleNamespace
 from sglang.bench_serving import run_benchmark
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_E2E_TEST,
+    popen_launch_server,
+)
 
 
 class TestServingThroughput(unittest.TestCase):
-
     def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size):
         # Launch the server
         other_args = []
@@ -20,7 +23,7 @@ class TestServingThroughput(unittest.TestCase):
         other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
 
         model = DEFAULT_MODEL_NAME_FOR_TEST
-        base_url = "http://127.0.0.1:9157"
+        base_url = DEFAULT_URL_FOR_E2E_TEST
         process = popen_launch_server(
             model, base_url, timeout=300, other_args=other_args
         )
diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py
index 01bfdb96a..750105615 100644
--- a/test/srt/test_skip_tokenizer_init.py
+++ b/test/srt/test_skip_tokenizer_init.py
@@ -6,17 +6,16 @@ import requests
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )
 
 
 class TestSkipTokenizerInit(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--skip-tokenizer-init"]
         )
diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py
index 2c40f5360..5e6bcbf60 100644
--- a/test/srt/test_srt_endpoint.py
+++ b/test/srt/test_srt_endpoint.py
@@ -6,17 +6,16 @@ import requests
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )
 
 
 class TestSRTEndpoint(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
 
     @classmethod
diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py
index 1ea1438fe..5133d3cd3 100644
--- a/test/srt/test_torch_compile.py
+++ b/test/srt/test_torch_compile.py
@@ -5,17 +5,16 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )
 
 
 class TestTorchCompile(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
         )
diff --git a/test/srt/test_triton_attn_backend.py b/test/srt/test_triton_attn_backend.py
index 67cbc623c..7a453d8be 100644
--- a/test/srt/test_triton_attn_backend.py
+++ b/test/srt/test_triton_attn_backend.py
@@ -5,17 +5,16 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )
 
 
 class TestTritonAttnBackend(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--disable-flashinfer"]
         )
diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py
index 0449e33f1..c599d8b36 100644
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -5,15 +5,14 @@ import openai
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server
 
 
 class TestOpenAIVisionServer(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model,