Fix the overhead due to penalizer in bench_latency (#1496)
This commit is contained in:
@@ -260,7 +260,7 @@ def correctness_test(

     # Decode
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
-    for _ in range(bench_args.output_len[0]):
+    for _ in range(bench_args.output_len[0] - 1):
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
         for i in range(len(reqs)):
             output_ids[i].append(next_token_ids[i])
@@ -311,7 +311,7 @@ def latency_test_run_once(

     # Decode
     decode_latencies = []
-    for i in range(output_len):
+    for i in range(output_len - 1):
         torch.cuda.synchronize()
         tic = time.time()
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
Reference in New Issue
Block a user