Improve end-to-end throughput test and its coverage (#1039)

2024-08-11 18:27:33 -07:00
parent 7de6034534
commit 8207637029
14 changed files with 224 additions and 46 deletions
--- a/.github/workflows/e2e-test.yml
+++ b/.github/workflows/e2e-test.yml
@@ -37,23 +37,16 @@ jobs:
    - name: Benchmark Serving Throughput
      run: |
-        python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --port 8413 --disable-radix-cache &
+        cd test/srt
-        SERVER_PID=$!
+        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
-        echo "Waiting for server to start..."
+    - name: Benchmark Serving Throughput (w/o RadixAttention)
-        for i in {1..120}; do
+      run: |
-          if curl -s http://127.0.0.1:8413/health; then
+        cd test/srt
-            echo "Server is up!"
+        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
            break
          fi
          if [ $i -eq 120 ]; then
            echo "Server failed to start within 120 seconds"
            exit 1
          fi
          sleep 1
        done
-        cd $HOME && python3 -m sglang.bench_serving --backend sglang --port 8413 --dataset-name random --num-prompts 500 --random-input 4096 --random-output 2048
+    - name: Benchmark Serving Throughput (w/o FlashInfer)
      run: |
        cd test/srt
        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_flashinfer
        echo "Stopping server..."
        kill -9 $SERVER_PID
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -39,6 +39,8 @@ from transformers import (
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
 global args
@dataclass
 class RequestFuncInput:
@@ -749,7 +751,11 @@ def check_chat_template(model_path):
        return False
-def fire(args: argparse.Namespace):
+def run_benchmark(args_: argparse.Namespace):
    global args
    args = args_
    set_ulimit()
    random.seed(args.seed)
    np.random.seed(args.seed)
@@ -853,7 +859,7 @@ def fire(args: argparse.Namespace):
                )
            )
    else:
-        asyncio.run(
+        return asyncio.run(
            benchmark(
                backend=backend,
                api_url=api_url,
@@ -962,11 +968,6 @@ if __name__ == "__main__":
        "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.",
    )
    parser.add_argument("--seed", type=int, default=0, help="Default is 0.")
    parser.add_argument(
        "--disable-tqdm",
        action="store_true",
        help="Specify to disable tqdm progress bar.",
    )
    parser.add_argument(
        "--multi",
        action="store_true",
@@ -979,6 +980,11 @@ if __name__ == "__main__":
        help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
    )
    parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
    parser.add_argument(
        "--disable-tqdm",
        action="store_true",
        help="Specify to disable tqdm progress bar.",
    )
    parser.add_argument(
        "--disable-stream",
        action="store_true",
@@ -996,8 +1002,5 @@ if __name__ == "__main__":
        help="Append given JSON object to the request payload. You can use this to specify"
        "additional generate params like sampling params.",
    )
    set_ulimit()
    args = parser.parse_args()
-    fire(args)
+    run_benchmark(args)
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -21,6 +21,7 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import get_exception_traceback
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_URL_FOR_TEST = "http://127.0.0.1:8157"
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
--- a/test/srt/test_chunked_prefill.py
+++ b/test/srt/test_chunked_prefill.py
@@ -3,7 +3,11 @@ from types import SimpleNamespace
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
 )
 class TestAccuracy(unittest.TestCase):
@@ -11,7 +15,7 @@ class TestAccuracy(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
--- a/test/srt/test_embedding_openai_server.py
+++ b/test/srt/test_embedding_openai_server.py
@@ -4,7 +4,7 @@ import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import popen_launch_server
+from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server
 class TestOpenAIServer(unittest.TestCase):
@@ -12,7 +12,7 @@ class TestOpenAIServer(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = "intfloat/e5-mistral-7b-instruct"
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, api_key=cls.api_key
--- a/test/srt/test_eval_accuracy.py
+++ b/test/srt/test_eval_accuracy.py
@@ -3,7 +3,11 @@ from types import SimpleNamespace
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
 )
 class TestAccuracy(unittest.TestCase):
@@ -11,7 +15,7 @@ class TestAccuracy(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
    @classmethod
--- a/test/srt/test_large_max_new_tokens.py
+++ b/test/srt/test_large_max_new_tokens.py
@@ -8,7 +8,11 @@ import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
 )
 class TestOpenAIServer(unittest.TestCase):
@@ -16,7 +20,7 @@ class TestOpenAIServer(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
--- a/test/srt/test_openai_server.py
+++ b/test/srt/test_openai_server.py
@@ -6,7 +6,11 @@ import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
 )
 class TestOpenAIServer(unittest.TestCase):
@@ -14,7 +18,7 @@ class TestOpenAIServer(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, api_key=cls.api_key
--- a/test/srt/test_serving_throughput.py
+++ b/test/srt/test_serving_throughput.py
@@ -0,0 +1,92 @@
 import unittest
 from types import SimpleNamespace
 from sglang.bench_serving import run_benchmark
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 class TestServingThroughput(unittest.TestCase):
    def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size):
        # Launch the server
        other_args = []
        if disable_radix_cache:
            other_args.append("--disable-radix-cache")
        if disable_flashinfer:
            other_args.append("--disable-flashinfer")
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
        model = DEFAULT_MODEL_NAME_FOR_TEST
        base_url = "http://127.0.0.1:9157"
        process = popen_launch_server(
            model, base_url, timeout=300, other_args=other_args
        )
        # Run benchmark
        num_prompts = 400
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
            host=None,
            port=None,
            dataset_name="random",
            dataset_path="",
            model=None,
            tokenizer=None,
            num_prompts=num_prompts,
            sharegpt_output_len=None,
            random_input_len=4096,
            random_output_len=2048,
            random_range_ratio=0.0,
            request_rate=float("inf"),
            multi=None,
            seed=0,
            output_file=None,
            disable_tqdm=False,
            disable_stream=False,
            disable_ignore_eos=False,
            extra_request_body=None,
        )
        try:
            res = run_benchmark(args)
        finally:
            kill_child_process(process.pid)
        assert res["completed"] == num_prompts
    def test_default(self):
        self.run_test(
            disable_radix_cache=False,
            disable_flashinfer=False,
            chunked_prefill_size=-1,
        )
    def test_default_without_radix_cache(self):
        self.run_test(
            disable_radix_cache=True,
            disable_flashinfer=False,
            chunked_prefill_size=-1,
        )
    def test_default_without_flashinfer(self):
        self.run_test(
            disable_radix_cache=False,
            disable_flashinfer=True,
            chunked_prefill_size=-1,
        )
    def test_all_cases(self):
        for disable_radix_cache in [False, True]:
            for disable_flashinfer in [False, True]:
                for chunked_prefill_size in [-1, 2048]:
                    self.run_test(
                        disable_radix_cache=False,
                        disable_flashinfer=False,
                        chunked_prefill_size=-1,
                    )
 # Run the test suite when this file is executed directly as a script.
 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/test_skip_tokenizer_init.py
+++ b/test/srt/test_skip_tokenizer_init.py
@@ -4,7 +4,11 @@ import unittest
 import requests
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
 )
 class TestSkipTokenizerInit(unittest.TestCase):
@@ -12,7 +16,7 @@ class TestSkipTokenizerInit(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, other_args=["--skip-tokenizer-init"]
        )
--- a/test/srt/test_srt_endpoint.py
+++ b/test/srt/test_srt_endpoint.py
@@ -4,7 +4,11 @@ import unittest
 import requests
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
 )
 class TestSRTEndpoint(unittest.TestCase):
@@ -12,7 +16,7 @@ class TestSRTEndpoint(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
    @classmethod
--- a/test/srt/test_throughput.py
+++ b/test/srt/test_throughput.py
@@ -0,0 +1,61 @@
 import json
 import unittest
 import requests
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
 )
 class TestSRTEndpoint(unittest.TestCase):
    """Smoke tests that post requests to the server's ``/generate`` endpoint.
    One server process is launched for the whole class and torn down after the
    last test. The tests print the JSON responses; they do not assert on them.
    """
    @classmethod
    def setUpClass(cls):
        """Launch the server shared by every test in this class."""
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        # Use the shared test URL constant instead of repeating the literal,
        # matching the other server tests.
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
    @classmethod
    def tearDownClass(cls):
        """Stop the server launched in ``setUpClass``."""
        kill_child_process(cls.process.pid)
    def run_decode(
        self, return_logprob=False, top_logprobs_num=0, return_text=False, n=1
    ):
        """Send one non-streaming ``/generate`` request and print the response.
        Args:
            return_logprob: Forwarded as ``return_logprob`` in the payload.
            top_logprobs_num: Forwarded as ``top_logprobs_num`` in the payload.
            return_text: Forwarded as ``return_text_in_logprobs``.
            n: Number of samples requested; temperature is 0 when ``n == 1``
                and 0.5 otherwise.
        """
        response = requests.post(
            self.base_url + "/generate",
            json={
                "text": "The capital of France is",
                "sampling_params": {
                    "temperature": 0 if n == 1 else 0.5,
                    "max_new_tokens": 32,
                    "n": n,
                },
                "stream": False,
                "return_logprob": return_logprob,
                "top_logprobs_num": top_logprobs_num,
                "return_text_in_logprobs": return_text,
                "logprob_start_len": 0,
            },
        )
        print(json.dumps(response.json()))
        print("=" * 100)
    def test_simple_decode(self):
        """Single-sample request with default options."""
        self.run_decode()
    def test_parallel_sample(self):
        """Request three samples for the same prompt."""
        self.run_decode(n=3)
    def test_logprob(self):
        """Request logprobs with and without top-k entries and text."""
        for top_logprobs_num in [0, 3]:
            for return_text in [True, False]:
                self.run_decode(
                    return_logprob=True,
                    top_logprobs_num=top_logprobs_num,
                    return_text=return_text,
                )
 # Run the test suite when this file is executed directly as a script.
 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/test_torch_compile.py
+++ b/test/srt/test_torch_compile.py
@@ -3,7 +3,11 @@ from types import SimpleNamespace
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
 )
 class TestAccuracy(unittest.TestCase):
@@ -11,7 +15,7 @@ class TestAccuracy(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
        )
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -5,7 +5,7 @@ import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import popen_launch_server
+from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server
 class TestOpenAIVisionServer(unittest.TestCase):
@@ -13,7 +13,7 @@ class TestOpenAIVisionServer(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,