Fix the overhead due to penalizer in bench_latency (#1496)

This commit is contained in:
Lianmin Zheng
2024-09-23 07:38:14 -07:00
committed by GitHub
parent 42a2d82ba7
commit 2854a5ea9f
6 changed files with 9 additions and 16 deletions

View File

@@ -260,7 +260,7 @@ def correctness_test(
# Decode
output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
-for _ in range(bench_args.output_len[0]):
+for _ in range(bench_args.output_len[0] - 1):
next_token_ids, _ = decode(next_token_ids, batch, model_runner)
for i in range(len(reqs)):
output_ids[i].append(next_token_ids[i])
@@ -311,7 +311,7 @@ def latency_test_run_once(
# Decode
decode_latencies = []
-for i in range(output_len):
+for i in range(output_len - 1):
torch.cuda.synchronize()
tic = time.time()
next_token_ids, _ = decode(next_token_ids, batch, model_runner)