From f7fb68d2925201ce234e97d81ad3095e4dc48cbb Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 13 Aug 2024 16:43:23 +0800 Subject: [PATCH] ci: add moe test (#1053) --- .github/workflows/moe-test.yml | 42 +++++++ python/sglang/test/test_utils.py | 6 +- test/srt/test_chunked_prefill.py | 5 +- test/srt/test_embedding_openai_server.py | 5 +- test/srt/test_eval_accuracy_large.py | 10 +- ...est_eval_accuracy_large_chunked_prefill.py | 10 +- test/srt/test_eval_accuracy_mini.py | 5 +- test/srt/test_large_max_new_tokens.py | 5 +- test/srt/test_moe_serving_throughput.py | 112 ++++++++++++++++++ test/srt/test_openai_server.py | 5 +- test/srt/test_serving_throughput.py | 9 +- test/srt/test_skip_tokenizer_init.py | 5 +- test/srt/test_srt_endpoint.py | 5 +- test/srt/test_torch_compile.py | 5 +- test/srt/test_triton_attn_backend.py | 5 +- test/srt/test_vision_openai_server.py | 5 +- 16 files changed, 195 insertions(+), 44 deletions(-) create mode 100644 .github/workflows/moe-test.yml create mode 100644 test/srt/test_moe_serving_throughput.py diff --git a/.github/workflows/moe-test.yml b/.github/workflows/moe-test.yml new file mode 100644 index 000000000..a781f2eff --- /dev/null +++ b/.github/workflows/moe-test.yml @@ -0,0 +1,42 @@ +name: MoE Test + +on: + push: + branches: [ main ] + paths: + - "python/sglang/**" + - "test/**" + pull_request: + branches: [ main ] + paths: + - "python/sglang/**" + - "test/**" + workflow_dispatch: + +concurrency: + group: moe-test-${{ github.ref }} + cancel-in-progress: true + +jobs: + moe-test: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: accuracy + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install dependencies + run: | + source $HOME/venv/bin/activate + echo "$HOME/venv/bin" >> $GITHUB_PATH + + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + + - name: Benchmark MOE Serving Throughput + run: | + cd test/srt + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 7243ff2ec..66f3e4f35 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -21,7 +21,11 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.utils import get_exception_traceback DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct" -DEFAULT_URL_FOR_TEST = "http://127.0.0.1:8157" +DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1" +DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157" +DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157" +DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157" +DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157" def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None): diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py index 5b2bb4aaa..94c424762 100644 --- a/test/srt/test_chunked_prefill.py +++ b/test/srt/test_chunked_prefill.py @@ -5,20 +5,19 @@ from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestChunkedPrefill(unittest.TestCase): - def run_mmlu(self, disable_radix_cache): other_args = ["--chunked-prefill-size", "32"] if disable_radix_cache: other_args += ["--disable-radix-cache"] model = DEFAULT_MODEL_NAME_FOR_TEST - base_url = DEFAULT_URL_FOR_TEST + base_url = DEFAULT_URL_FOR_UNIT_TEST process = popen_launch_server( model, base_url, diff --git a/test/srt/test_embedding_openai_server.py b/test/srt/test_embedding_openai_server.py index 45580feda..fd8fec48e 100644 --- a/test/srt/test_embedding_openai_server.py +++ b/test/srt/test_embedding_openai_server.py @@ -4,15 +4,14 @@ import openai from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server +from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server class TestOpenAIServer(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = "intfloat/e5-mistral-7b-instruct" - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, api_key=cls.api_key diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py index 556954331..9f99b0b95 100644 --- a/test/srt/test_eval_accuracy_large.py +++ b/test/srt/test_eval_accuracy_large.py @@ -5,17 +5,17 @@ from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_ACCURACY_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestEvalAccuracyLarge(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = "http://127.0.0.1:7157" + cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST cls.process = popen_launch_server( cls.model, cls.base_url, @@ -49,7 +49,7 @@ class TestEvalAccuracyLarge(unittest.TestCase): ) metrics = run_eval(args) - assert metrics["score"] >= 0.65, f"{metrics}" + assert metrics["score"] >= 0.64, f"{metrics}" def test_mgsm_en(self): args = SimpleNamespace( @@ -61,7 +61,7 @@ class TestEvalAccuracyLarge(unittest.TestCase): ) metrics = run_eval(args) - assert metrics["score"] >= 0.85, f"{metrics}" + assert metrics["score"] >= 0.84, f"{metrics}" if __name__ == "__main__": diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py index 297fc22e1..040a2db75 100644 --- a/test/srt/test_eval_accuracy_large_chunked_prefill.py +++ b/test/srt/test_eval_accuracy_large_chunked_prefill.py @@ -5,17 +5,17 @@ from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_ACCURACY_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = "http://127.0.0.1:7157" + cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST cls.process = popen_launch_server( cls.model, cls.base_url, @@ -49,7 +49,7 @@ class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): ) metrics = run_eval(args) - assert metrics["score"] >= 0.65, f"{metrics}" + assert metrics["score"] >= 0.64, f"{metrics}" def test_mgsm_en(self): args = SimpleNamespace( @@ -61,7 +61,7 @@ class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): ) metrics = run_eval(args) - assert metrics["score"] >= 0.85, f"{metrics}" + assert metrics["score"] >= 0.84, f"{metrics}" if __name__ == "__main__": diff --git a/test/srt/test_eval_accuracy_mini.py b/test/srt/test_eval_accuracy_mini.py index b5533da37..a4219b1a0 100644 --- a/test/srt/test_eval_accuracy_mini.py +++ b/test/srt/test_eval_accuracy_mini.py @@ -5,17 +5,16 @@ from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestEvalAccuracyMini(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300) @classmethod diff --git a/test/srt/test_large_max_new_tokens.py b/test/srt/test_large_max_new_tokens.py index 58f82b351..f29adabce 100644 --- a/test/srt/test_large_max_new_tokens.py +++ b/test/srt/test_large_max_new_tokens.py @@ -10,17 +10,16 @@ from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestOpenAIServer(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py new file mode 100644 index 000000000..6353e5099 --- /dev/null +++ b/test/srt/test_moe_serving_throughput.py @@ -0,0 +1,112 @@ +import os +import unittest +from types import SimpleNamespace + +from sglang.bench_serving import run_benchmark +from sglang.srt.server_args import ServerArgs +from sglang.srt.utils import kill_child_process +from sglang.test.test_utils import ( + DEFAULT_MOE_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_MOE_TEST, + popen_launch_server, +) + + +class TestServingThroughput(unittest.TestCase): + def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size): + # Launch the server + other_args = [] + if disable_radix_cache: + other_args.append("--disable-radix-cache") + if disable_flashinfer: + other_args.append("--disable-flashinfer") + other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)]) + other_args.extend(["--tensor-parallel-size", "2"]) + other_args.append("--enable-p2p-check") + + model = DEFAULT_MOE_MODEL_NAME_FOR_TEST + base_url = DEFAULT_URL_FOR_MOE_TEST + process = popen_launch_server( + model, base_url, timeout=300, other_args=other_args + ) + + # Run benchmark + num_prompts = 400 + args = SimpleNamespace( + backend="sglang", + base_url=base_url, + host=None, + port=None, + dataset_name="random", + dataset_path="", + model=None, + tokenizer=None, + num_prompts=num_prompts, + sharegpt_output_len=None, + random_input_len=4096, + random_output_len=2048, + random_range_ratio=0.0, + request_rate=float("inf"), + multi=None, + seed=0, + output_file=None, + disable_tqdm=False, + disable_stream=False, + disable_ignore_eos=False, + extra_request_body=None, + ) + + try: + res = run_benchmark(args) + finally: + kill_child_process(process.pid) + + assert res["completed"] == num_prompts + return res + + def test_default(self): + res = self.run_test( + disable_radix_cache=ServerArgs.disable_radix_cache, + disable_flashinfer=ServerArgs.disable_flashinfer, + chunked_prefill_size=ServerArgs.chunked_prefill_size, + ) + + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 (PCIE) performance + assert res["output_throughput"] > 950 + + def test_default_without_radix_cache(self): + res = self.run_test( + disable_radix_cache=True, + disable_flashinfer=ServerArgs.disable_flashinfer, + chunked_prefill_size=ServerArgs.chunked_prefill_size, + ) + + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 (PCIE) performance + assert res["output_throughput"] > 950 + + def test_default_with_chunked_prefill(self): + res = self.run_test( + disable_radix_cache=ServerArgs.disable_radix_cache, + disable_flashinfer=ServerArgs.disable_flashinfer, + chunked_prefill_size=8192, + ) + + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 (PCIE) performance + print(res["output_throughput"]) + + def test_all_cases(self): + for disable_radix_cache in [False, True]: + for disable_flashinfer in [False, True]: + for chunked_prefill_size in [-1, 2048]: + self.run_test( + disable_radix_cache=False, + disable_flashinfer=False, + chunked_prefill_size=-1, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index b66c35f01..872424756 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -8,17 +8,16 @@ from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestOpenAIServer(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, api_key=cls.api_key diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py index c733163f5..c99d2e07e 100644 --- a/test/srt/test_serving_throughput.py +++ b/test/srt/test_serving_throughput.py @@ -5,11 +5,14 @@ from types import SimpleNamespace from sglang.bench_serving import run_benchmark from sglang.srt.server_args import ServerArgs from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_E2E_TEST, + popen_launch_server, +) class TestServingThroughput(unittest.TestCase): - def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size): # Launch the server other_args = [] @@ -20,7 +23,7 @@ class TestServingThroughput(unittest.TestCase): other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)]) model = DEFAULT_MODEL_NAME_FOR_TEST - base_url = "http://127.0.0.1:9157" + base_url = DEFAULT_URL_FOR_E2E_TEST process = popen_launch_server( model, base_url, timeout=300, other_args=other_args ) diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py index 01bfdb96a..750105615 100644 --- a/test/srt/test_skip_tokenizer_init.py +++ b/test/srt/test_skip_tokenizer_init.py @@ -6,17 +6,16 @@ import requests from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestSkipTokenizerInit(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, other_args=["--skip-tokenizer-init"] ) diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py index 2c40f5360..5e6bcbf60 100644 --- a/test/srt/test_srt_endpoint.py +++ b/test/srt/test_srt_endpoint.py @@ -6,17 +6,16 @@ import requests from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestSRTEndpoint(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300) @classmethod diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py index 1ea1438fe..5133d3cd3 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -5,17 +5,16 @@ from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestTorchCompile(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"] ) diff --git a/test/srt/test_triton_attn_backend.py b/test/srt/test_triton_attn_backend.py index 67cbc623c..7a453d8be 100644 --- a/test/srt/test_triton_attn_backend.py +++ b/test/srt/test_triton_attn_backend.py @@ -5,17 +5,16 @@ from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestTritonAttnBackend(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, other_args=["--disable-flashinfer"] ) diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 0449e33f1..c599d8b36 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -5,15 +5,14 @@ import openai from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server +from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server class TestOpenAIVisionServer(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = "liuhaotian/llava-v1.6-vicuna-7b" - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model,