[Fix] Fix major performance bug in certain cases (#1563)

Co-authored-by: hnyls2002 <hnyls2002@gmail.com>
This commit is contained in:
Ying Sheng
2024-10-04 01:51:11 -07:00
committed by GitHub
parent 2432ad40c6
commit 04b262cd91
5 changed files with 50 additions and 18 deletions

View File

@@ -20,7 +20,22 @@ class TestBenchServing(unittest.TestCase):
)
if is_in_ci():
assert res["output_throughput"] > 2600
assert res["output_throughput"] > 2830
def test_offline_throughput_non_stream_small_batch_size(self):
    """Benchmark non-streaming throughput with a small running-request cap.

    Runs the serving benchmark over 200 "sharegpt" prompts at an infinite
    request rate with streaming disabled, passing
    ``--max-running-requests 10`` as extra server arguments. When
    ``is_in_ci()`` is true, the reported ``output_throughput`` must be
    greater than 1000.
    """
    # Extra arguments forwarded to the benchmark helper for the server.
    server_args = ["--max-running-requests", "10"]

    result = run_bench_serving(
        model=DEFAULT_MODEL_NAME_FOR_TEST,
        num_prompts=200,
        request_rate=float("inf"),
        dataset_name="sharegpt",
        random_input_len=None,
        random_output_len=None,
        disable_stream=True,
        other_server_args=server_args,
    )

    # The throughput floor is only checked when is_in_ci() reports true.
    if not is_in_ci():
        return
    assert result["output_throughput"] > 1000
def test_offline_throughput_without_radix_cache(self):
res = run_bench_serving(
@@ -31,7 +46,7 @@ class TestBenchServing(unittest.TestCase):
)
if is_in_ci():
assert res["output_throughput"] > 2800
assert res["output_throughput"] > 2880
def test_offline_throughput_without_chunked_prefill(self):
res = run_bench_serving(
@@ -58,7 +73,7 @@ class TestBenchServing(unittest.TestCase):
)
if is_in_ci():
assert res["output_throughput"] > 2600
assert res["output_throughput"] > 2930
def test_offline_throughput_default_fp8(self):
res = run_bench_serving(