Fix the overhead caused by the penalizer in bench_latency (#1496)

2024-09-23 07:38:14 -07:00
parent 42a2d82ba7
commit 2854a5ea9f
6 changed files with 9 additions and 16 deletions
--- a/python/sglang/bench_latency.py
+++ b/python/sglang/bench_latency.py
@@ -260,7 +260,7 @@ def correctness_test(

    # Decode
    output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
-    for _ in range(bench_args.output_len[0]):
+    for _ in range(bench_args.output_len[0] - 1):
        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
        for i in range(len(reqs)):
            output_ids[i].append(next_token_ids[i])
@@ -311,7 +311,7 @@ def latency_test_run_once(

    # Decode
    decode_latencies = []
-    for i in range(output_len):
+    for i in range(output_len - 1):
        torch.cuda.synchronize()
        tic = time.time()
        next_token_ids, _ = decode(next_token_ids, batch, model_runner)