Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>
This commit is contained in:
Lianmin Zheng
2025-03-03 00:12:04 -08:00
parent 0194948fd9
commit ac2387279e
86 changed files with 4116 additions and 2015 deletions

View File

@@ -12,7 +12,6 @@ suites = {
"models/test_generation_models.py",
"models/test_qwen_models.py",
"models/test_reward_models.py",
"sampling/penaltylib",
"test_abort.py",
"test_chunked_prefill.py",
"test_custom_allreduce.py",
@@ -31,6 +30,7 @@ suites = {
"test_no_chunked_prefill.py",
"test_no_overlap_scheduler.py",
"test_openai_server.py",
"test_penalty.py",
"test_pytorch_sampling_backend.py",
"test_radix_attention.py",
"test_regex_constrained.py",
@@ -38,7 +38,8 @@ suites = {
"test_request_length_validation.py",
"test_retract_decode.py",
"test_server_args.py",
"test_session_control.py",
# Disabled temporarily
# "test_session_control.py",
"test_skip_tokenizer_init.py",
"test_srt_engine.py",
"test_srt_endpoint.py",
@@ -64,9 +65,6 @@ suites = {
# Disable temporarily
# "test_nightly_math_eval.py",
],
"sampling/penaltylib": glob.glob(
"sampling/penaltylib/**/test_*.py", recursive=True
),
}
# Expand suite
@@ -83,7 +81,7 @@ if __name__ == "__main__":
arg_parser.add_argument(
"--timeout-per-file",
type=int,
default=2000,
default=1800,
help="The time limit for running one file in seconds.",
)
arg_parser.add_argument(