ci: add moe test (#1053)

2024-08-13 16:43:23 +08:00
parent 396a13e6ad
commit f7fb68d292
16 changed files with 195 additions and 44 deletions
--- a/.github/workflows/moe-test.yml
+++ b/.github/workflows/moe-test.yml
@@ -0,0 +1,42 @@
 name: MoE Test
 on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:
 concurrency:
  group: moe-test-${{ github.ref }}
  cancel-in-progress: true
 jobs:
    moe-test:
        if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
        runs-on: accuracy
        steps:
            - name: Checkout code
              uses: actions/checkout@v3
            - name: Install dependencies
              run: |
                source $HOME/venv/bin/activate
                echo "$HOME/venv/bin" >> $GITHUB_PATH
                pip install --upgrade pip
                pip install -e "python[all]"
                pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
            - name: Benchmark MOE Serving Throughput
              run: |
                cd test/srt
                python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
                python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -21,7 +21,11 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import get_exception_traceback
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-DEFAULT_URL_FOR_TEST = "http://127.0.0.1:8157"
+DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157"
 DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157"
 DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157"
 DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157"
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
--- a/test/srt/test_chunked_prefill.py
+++ b/test/srt/test_chunked_prefill.py
@@ -5,20 +5,19 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
    popen_launch_server,
 )
 class TestChunkedPrefill(unittest.TestCase):
    def run_mmlu(self, disable_radix_cache):
        other_args = ["--chunked-prefill-size", "32"]
        if disable_radix_cache:
            other_args += ["--disable-radix-cache"]
        model = DEFAULT_MODEL_NAME_FOR_TEST
-        base_url = DEFAULT_URL_FOR_TEST
+        base_url = DEFAULT_URL_FOR_UNIT_TEST
        process = popen_launch_server(
            model,
            base_url,
--- a/test/srt/test_embedding_openai_server.py
+++ b/test/srt/test_embedding_openai_server.py
@@ -4,15 +4,14 @@ import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server
 class TestOpenAIServer(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = "intfloat/e5-mistral-7b-instruct"
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, api_key=cls.api_key
--- a/test/srt/test_eval_accuracy_large.py
+++ b/test/srt/test_eval_accuracy_large.py
@@ -5,17 +5,17 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_ACCURACY_TEST,
    DEFAULT_URL_FOR_UNIT_TEST,
    popen_launch_server,
 )
 class TestEvalAccuracyLarge(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:7157"
+        cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
@@ -49,7 +49,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
        )
        metrics = run_eval(args)
-        assert metrics["score"] >= 0.65, f"{metrics}"
+        assert metrics["score"] >= 0.64, f"{metrics}"
    def test_mgsm_en(self):
        args = SimpleNamespace(
@@ -61,7 +61,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
        )
        metrics = run_eval(args)
-        assert metrics["score"] >= 0.85, f"{metrics}"
+        assert metrics["score"] >= 0.84, f"{metrics}"
 if __name__ == "__main__":
--- a/test/srt/test_eval_accuracy_large_chunked_prefill.py
+++ b/test/srt/test_eval_accuracy_large_chunked_prefill.py
@@ -5,17 +5,17 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_ACCURACY_TEST,
    DEFAULT_URL_FOR_UNIT_TEST,
    popen_launch_server,
 )
 class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:7157"
+        cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
@@ -49,7 +49,7 @@ class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
        )
        metrics = run_eval(args)
-        assert metrics["score"] >= 0.65, f"{metrics}"
+        assert metrics["score"] >= 0.64, f"{metrics}"
    def test_mgsm_en(self):
        args = SimpleNamespace(
@@ -61,7 +61,7 @@ class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
        )
        metrics = run_eval(args)
-        assert metrics["score"] >= 0.85, f"{metrics}"
+        assert metrics["score"] >= 0.84, f"{metrics}"
 if __name__ == "__main__":
--- a/test/srt/test_eval_accuracy_mini.py
+++ b/test/srt/test_eval_accuracy_mini.py
@@ -5,17 +5,16 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
    popen_launch_server,
 )
 class TestEvalAccuracyMini(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
        cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
    @classmethod
--- a/test/srt/test_large_max_new_tokens.py
+++ b/test/srt/test_large_max_new_tokens.py
@@ -10,17 +10,16 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
    popen_launch_server,
 )
 class TestOpenAIServer(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
--- a/test/srt/test_moe_serving_throughput.py
+++ b/test/srt/test_moe_serving_throughput.py
@@ -0,0 +1,112 @@
 import os
 import unittest
 from types import SimpleNamespace
 from sglang.bench_serving import run_benchmark
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_MOE_TEST,
    popen_launch_server,
 )
 class TestServingThroughput(unittest.TestCase):
    def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size):
        # Launch the server
        other_args = []
        if disable_radix_cache:
            other_args.append("--disable-radix-cache")
        if disable_flashinfer:
            other_args.append("--disable-flashinfer")
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
        other_args.extend(["--tensor-parallel-size", "2"])
        other_args.append("--enable-p2p-check")
        model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        base_url = DEFAULT_URL_FOR_MOE_TEST
        process = popen_launch_server(
            model, base_url, timeout=300, other_args=other_args
        )
        # Run benchmark
        num_prompts = 400
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
            host=None,
            port=None,
            dataset_name="random",
            dataset_path="",
            model=None,
            tokenizer=None,
            num_prompts=num_prompts,
            sharegpt_output_len=None,
            random_input_len=4096,
            random_output_len=2048,
            random_range_ratio=0.0,
            request_rate=float("inf"),
            multi=None,
            seed=0,
            output_file=None,
            disable_tqdm=False,
            disable_stream=False,
            disable_ignore_eos=False,
            extra_request_body=None,
        )
        try:
            res = run_benchmark(args)
        finally:
            kill_child_process(process.pid)
        assert res["completed"] == num_prompts
        return res
    def test_default(self):
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            disable_flashinfer=ServerArgs.disable_flashinfer,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )
        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE) performance
            assert res["output_throughput"] > 950
    def test_default_without_radix_cache(self):
        res = self.run_test(
            disable_radix_cache=True,
            disable_flashinfer=ServerArgs.disable_flashinfer,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )
        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE) performance
            assert res["output_throughput"] > 950
    def test_default_with_chunked_prefill(self):
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            disable_flashinfer=ServerArgs.disable_flashinfer,
            chunked_prefill_size=8192,
        )
        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE) performance
            print(res["output_throughput"])
    def test_all_cases(self):
        for disable_radix_cache in [False, True]:
            for disable_flashinfer in [False, True]:
                for chunked_prefill_size in [-1, 2048]:
                    self.run_test(
                        disable_radix_cache=False,
                        disable_flashinfer=False,
                        chunked_prefill_size=-1,
                    )
 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/test_openai_server.py
+++ b/test/srt/test_openai_server.py
@@ -8,17 +8,16 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
    popen_launch_server,
 )
 class TestOpenAIServer(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, api_key=cls.api_key
--- a/test/srt/test_serving_throughput.py
+++ b/test/srt/test_serving_throughput.py
@@ -5,11 +5,14 @@ from types import SimpleNamespace
 from sglang.bench_serving import run_benchmark
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_E2E_TEST,
    popen_launch_server,
 )
 class TestServingThroughput(unittest.TestCase):
    def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size):
        # Launch the server
        other_args = []
@@ -20,7 +23,7 @@ class TestServingThroughput(unittest.TestCase):
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
        model = DEFAULT_MODEL_NAME_FOR_TEST
-        base_url = "http://127.0.0.1:9157"
+        base_url = DEFAULT_URL_FOR_E2E_TEST
        process = popen_launch_server(
            model, base_url, timeout=300, other_args=other_args
        )
--- a/test/srt/test_skip_tokenizer_init.py
+++ b/test/srt/test_skip_tokenizer_init.py
@@ -6,17 +6,16 @@ import requests
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
    popen_launch_server,
 )
 class TestSkipTokenizerInit(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, other_args=["--skip-tokenizer-init"]
        )
--- a/test/srt/test_srt_endpoint.py
+++ b/test/srt/test_srt_endpoint.py
@@ -6,17 +6,16 @@ import requests
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
    popen_launch_server,
 )
 class TestSRTEndpoint(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
        cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
    @classmethod
--- a/test/srt/test_torch_compile.py
+++ b/test/srt/test_torch_compile.py
@@ -5,17 +5,16 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
    popen_launch_server,
 )
 class TestTorchCompile(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
        )
--- a/test/srt/test_triton_attn_backend.py
+++ b/test/srt/test_triton_attn_backend.py
@@ -5,17 +5,16 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
    popen_launch_server,
 )
 class TestTritonAttnBackend(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, other_args=["--disable-flashinfer"]
        )
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -5,15 +5,14 @@ import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server
 class TestOpenAIVisionServer(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,