Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>
This commit is contained in:
Lianmin Zheng
2025-03-03 00:12:04 -08:00
parent 0194948fd9
commit ac2387279e
86 changed files with 4116 additions and 2015 deletions

View File

@@ -138,6 +138,7 @@ class TestBenchServing(unittest.TestCase):
model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
num_prompts=50,
request_rate=1,
sharegpt_context_len=3072,
disable_ignore_eos=True,
dataset_name="sharegpt",
other_server_args=[
@@ -148,22 +149,23 @@ class TestBenchServing(unittest.TestCase):
"--speculative-num-steps",
"5",
"--speculative-eagle-topk",
-"8",
+"4",
"--speculative-num-draft-tokens",
-"64",
+"16",
"--mem-fraction-static",
"0.7",
"--cuda-graph-max-bs",
"32",
],
need_warmup=True,
)
if is_in_ci():
write_github_step_summary(
f"### test_online_latency_eagle\n"
f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
f'accept_length : {res["accept_length"]:.2f} \n'
)
-self.assertLess(res["median_e2e_latency_ms"], 450)
+self.assertLess(res["median_e2e_latency_ms"], 700)
self.assertGreater(res["accept_length"], 2.50)
def test_moe_offline_throughput_default(self):
res = run_bench_serving(