Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>
2025-03-03 00:12:04 -08:00
parent 0194948fd9
commit ac2387279e
86 changed files with 4116 additions and 2015 deletions
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -12,7 +12,6 @@ suites = {
        "models/test_generation_models.py",
        "models/test_qwen_models.py",
        "models/test_reward_models.py",
-        "sampling/penaltylib",
        "test_abort.py",
        "test_chunked_prefill.py",
        "test_custom_allreduce.py",
@@ -31,6 +30,7 @@ suites = {
        "test_no_chunked_prefill.py",
        "test_no_overlap_scheduler.py",
        "test_openai_server.py",
+        "test_penalty.py",
        "test_pytorch_sampling_backend.py",
        "test_radix_attention.py",
        "test_regex_constrained.py",
@@ -38,7 +38,8 @@ suites = {
        "test_request_length_validation.py",
        "test_retract_decode.py",
        "test_server_args.py",
-        "test_session_control.py",
+        # Disabled temporarily
+        # "test_session_control.py",
        "test_skip_tokenizer_init.py",
        "test_srt_engine.py",
        "test_srt_endpoint.py",
@@ -64,9 +65,6 @@ suites = {
        # Disable temporarily
        # "test_nightly_math_eval.py",
    ],
-    "sampling/penaltylib": glob.glob(
-        "sampling/penaltylib/**/test_*.py", recursive=True
-    ),
 }

 # Expand suite
@@ -83,7 +81,7 @@ if __name__ == "__main__":
    arg_parser.add_argument(
        "--timeout-per-file",
        type=int,
-        default=2000,
+        default=1800,
        help="The time limit for running one file in seconds.",
    )
    arg_parser.add_argument(