Re-organize CI tests (#1052)

2024-08-12 03:39:01 -07:00
parent 0c1c72a0b4
commit c877292cc1
9 changed files with 148 additions and 22 deletions
--- a/.github/workflows/e2e-test.yml
+++ b/.github/workflows/e2e-test.yml
@@ -45,8 +45,7 @@ jobs:
        cd test/srt
        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache

-    - name: Benchmark Serving Throughput (w/o FlashInfer)
+    - name: Benchmark Serving Throughput (w/ ChunkedPrefill)
      run: |
        cd test/srt
-        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_flashinfer
-
+        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_with_chunked_prefill
--- a/python/sglang/srt/constrained/base_tool_cache.py
+++ b/python/sglang/srt/constrained/base_tool_cache.py
@@ -54,7 +54,7 @@ class BaseToolCache:
        return val

    def init_value(self, key):
-        raise NotImplementedError
+        raise NotImplementedError()

    def get_cache_hit_rate(self):
        if self.metrics["total"] == 0:
--- a/python/sglang/srt/managers/tp_worker.py
+++ b/python/sglang/srt/managers/tp_worker.py
@@ -410,13 +410,16 @@ class ModelTpServer:

        # Print stats
        if self.tp_rank == 0:
-            self.tree_cache_metrics["total"] += (
-                adder.log_input_tokens + adder.log_hit_tokens
-            ) / 10**9
-            self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
-            tree_cache_hit_rate = (
-                self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
-            )
+            if isinstance(self.tree_cache, RadixCache):
+                self.tree_cache_metrics["total"] += (
+                    adder.log_input_tokens + adder.log_hit_tokens
+                ) / 10**9
+                self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
+                tree_cache_hit_rate = (
+                    self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
+                )
+            else:
+                tree_cache_hit_rate = 0.0
            logger.info(
                f"[gpu={self.gpu_id}] Prefill batch. "
                f"#new-seq: {len(can_run_list)}, "
--- a/python/sglang/srt/mem_cache/chunk_cache.py
+++ b/python/sglang/srt/mem_cache/chunk_cache.py
@@ -68,7 +68,7 @@ class ChunkCache(BasePrefixCache):
        req.last_node = entry

    def insert(self):
-        raise NotImplementedError
+        raise NotImplementedError()

    def evict(self, num_tokens: int, evict_callback: Callable):
        pass
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -447,6 +447,15 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
        print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
        sys.exit(1)

+    # Print warnings here
+    if server_args.disable_radix_cache and server_args.chunked_prefill_size is not None:
+        logger.warning(
+            "You set both `--disable-radix-cache` and `--chunked-prefill-size`. "
+            "This combination is an experimental feature and we noticed it can lead to "
+            "wrong generation results. If you want to use chunked prefill, it is recommended "
+            "not using `--disable-radix-cache`."
+        )
+
    logger.info("The server is fired up and ready to roll!")
    if pipe_finish_writer is not None:
        pipe_finish_writer.send("init ok")
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -12,6 +12,7 @@ suites = {
        "test_openai_server.py",
        "test_skip_tokenizer_init.py",
        "test_torch_compile.py",
+        "test_triton_attn_backend.py",
        "test_vision_openai_server.py",
        "test_large_max_new_tokens.py",
        "models/test_generation_models.py",
--- a/test/srt/test_eval_accuracy_large_chunked_prefill.py
+++ b/test/srt/test_eval_accuracy_large_chunked_prefill.py
@@ -0,0 +1,68 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
+    """Accuracy checks against a server launched with ``--chunked-prefill-size 256``.
+
+    A single server process is started for the whole class; each test runs one
+    benchmark through ``run_eval`` and asserts a minimum ``score``.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch the server once and keep its handle on the class."""
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        # NOTE(review): the URL is hard-coded instead of using the imported
+        # DEFAULT_URL_FOR_TEST -- presumably to avoid a port clash with other
+        # suites; confirm before changing.
+        cls.base_url = "http://127.0.0.1:7157"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=300,
+            other_args=["--log-level-http", "warning", "--chunked-prefill-size", "256"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Hand the launched server's PID to ``kill_child_process``."""
+        kill_child_process(cls.process.pid)
+
+    def test_mmlu(self):
+        """Run the ``mmlu`` eval on 3000 examples and require a score >= 0.71."""
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=3000,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        # The metrics dict is used as the failure message so the log shows the score.
+        assert metrics["score"] >= 0.71, f"{metrics}"
+
+    def test_human_eval(self):
+        """Run the ``humaneval`` eval and require a score >= 0.65."""
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="humaneval",
+            # presumably None means "use every example"; verify against run_eval
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        assert metrics["score"] >= 0.65, f"{metrics}"
+
+    def test_mgsm_en(self):
+        """Run the ``mgsm_en`` eval and require a score >= 0.85."""
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            # presumably None means "use every example"; verify against run_eval
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        assert metrics["score"] >= 0.85, f"{metrics}"
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/test/srt/test_serving_throughput.py
+++ b/test/srt/test_serving_throughput.py
@@ -3,6 +3,7 @@ import unittest
 from types import SimpleNamespace

 from sglang.bench_serving import run_benchmark
+from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server

@@ -60,9 +61,9 @@ class TestServingThroughput(unittest.TestCase):

    def test_default(self):
        res = self.run_test(
-            disable_radix_cache=False,
-            disable_flashinfer=False,
-            chunked_prefill_size=-1,
+            disable_radix_cache=ServerArgs.disable_radix_cache,
+            disable_flashinfer=ServerArgs.disable_flashinfer,
+            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
@@ -72,21 +73,25 @@ class TestServingThroughput(unittest.TestCase):
    def test_default_without_radix_cache(self):
        res = self.run_test(
            disable_radix_cache=True,
-            disable_flashinfer=False,
-            chunked_prefill_size=-1,
+            disable_flashinfer=ServerArgs.disable_flashinfer,
+            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE) performance
            assert res["output_throughput"] >= 1450

-    def test_default_without_flashinfer(self):
-        self.run_test(
-            disable_radix_cache=False,
-            disable_flashinfer=True,
-            chunked_prefill_size=-1,
+    def test_default_with_chunked_prefill(self):
+        res = self.run_test(
+            disable_radix_cache=ServerArgs.disable_radix_cache,
+            disable_flashinfer=ServerArgs.disable_flashinfer,
+            chunked_prefill_size=8192,
        )

+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            # A100 (PCIE) performance
+            assert res["output_throughput"] >= 1400
+
    def test_all_cases(self):
        for disable_radix_cache in [False, True]:
            for disable_flashinfer in [False, True]:
--- a/test/srt/test_triton_attn_backend.py
+++ b/test/srt/test_triton_attn_backend.py
@@ -0,0 +1,41 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestTritonAttnBackend(unittest.TestCase):
+    """Accuracy smoke test against a server launched with ``--disable-flashinfer``.
+
+    NOTE(review): per the class name, disabling FlashInfer presumably selects
+    the Triton attention backend -- confirm against the server argument handling.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch the server once and keep its handle on the class."""
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model, cls.base_url, timeout=300, other_args=["--disable-flashinfer"]
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Hand the launched server's PID to ``kill_child_process``."""
+        kill_child_process(cls.process.pid)
+
+    def test_mmlu(self):
+        """Run the ``mmlu`` eval on 32 examples and require a score >= 0.6."""
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=32,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        # Attach the metrics dict so a CI failure reports the actual score
+        # instead of a bare AssertionError.
+        assert metrics["score"] >= 0.6, f"{metrics}"
+
+
+if __name__ == "__main__":
+    unittest.main()