[Fix] Fix major performance bug in certain cases (#1563)

Co-authored-by: hnyls2002 <hnyls2002@gmail.com>
2024-10-04 01:51:11 -07:00
parent 2432ad40c6
commit 04b262cd91
5 changed files with 50 additions and 18 deletions
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -20,7 +20,22 @@ class TestBenchServing(unittest.TestCase):
        )

        if is_in_ci():
-            assert res["output_throughput"] > 2600
+            assert res["output_throughput"] > 2830
+
+    def test_offline_throughput_non_stream_small_batch_size(self):  # throughput benchmark: streaming disabled, --max-running-requests 10
+        res = run_bench_serving(  # res is indexed by metric name below
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=200,
+            request_rate=float("inf"),  # infinite rate; presumably no pacing between requests - confirm in run_bench_serving
+            dataset_name="sharegpt",
+            random_input_len=None,  # None here; presumably only relevant to a random dataset - confirm
+            random_output_len=None,
+            disable_stream=True,  # the "non_stream" part of the test name
+            other_server_args=["--max-running-requests", "10"],  # the "small_batch_size" part; presumably caps concurrent requests at 10 - confirm
+        )
+
+        if is_in_ci():  # the throughput floor is asserted only when is_in_ci() is true
+            assert res["output_throughput"] > 1000  # regression floor; units are not shown in this hunk

    def test_offline_throughput_without_radix_cache(self):
        res = run_bench_serving(
@@ -31,7 +46,7 @@ class TestBenchServing(unittest.TestCase):
        )

        if is_in_ci():
-            assert res["output_throughput"] > 2800
+            assert res["output_throughput"] > 2880

    def test_offline_throughput_without_chunked_prefill(self):
        res = run_bench_serving(
@@ -58,7 +73,7 @@ class TestBenchServing(unittest.TestCase):
        )

        if is_in_ci():
-            assert res["output_throughput"] > 2600
+            assert res["output_throughput"] > 2930

    def test_offline_throughput_default_fp8(self):
        res = run_bench_serving(