Allow skipping warmup in bench_offline_throughput.py (#2103)

commit 3295cd8af2
parent 5942dfc00a
Author: Lianmin Zheng
Date: 2024-11-20 01:25:21 -08:00
Committed by: GitHub
2 changed files with 18 additions and 11 deletions
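
For orientation, a hedged sketch of how the new flag might be passed to the benchmark script. The module path `sglang.bench_offline_throughput` and the `--model-path` flag are assumptions inferred from the script name; only `--skip-warmup` comes from this commit.

```python
# Illustrative invocation only. The module path and "--model-path" are
# assumptions; replace the placeholder model path before actually running.
import subprocess
import sys

cmd = [
    sys.executable,
    "-m",
    "sglang.bench_offline_throughput",
    "--model-path",
    "<your-model-path>",  # placeholder, not a real model id
    "--skip-warmup",  # new in this commit: skip the warmup batch before measuring
]
subprocess.run(cmd)
```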

@@ -57,6 +57,7 @@ class BenchArgs:
     disable_ignore_eos: bool = False
     extra_request_body: Optional[str] = None
     seed: int = 1
+    skip_warmup: bool = False
     do_not_exit: bool = False
 
     @staticmethod
@@ -152,6 +153,11 @@ class BenchArgs:
             "additional generate params like sampling params.",
         )
         parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+        parser.add_argument(
+            "--skip-warmup",
+            action="store_true",
+            help="Skip the warmup batches.",
+        )
         parser.add_argument(
             "--do-not-exit",
             action="store_true",
@@ -261,14 +267,15 @@ def throughput_test(
     )
 
     # Warm up
-    logging.info("\nWarmup...")
-    throughput_test_once(
-        backend_name=bench_args.backend,
-        backend=backend,
-        reqs=warmup_requests,
-        ignore_eos=not bench_args.disable_ignore_eos,
-        extra_request_body=extra_request_body,
-    )
+    if not bench_args.skip_warmup:
+        logging.info("\nWarmup...")
+        throughput_test_once(
+            backend_name=bench_args.backend,
+            backend=backend,
+            reqs=warmup_requests,
+            ignore_eos=not bench_args.disable_ignore_eos,
+            extra_request_body=extra_request_body,
+        )
 
     logging.info("\nBenchmark...")
     result = throughput_test_once(
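
Taken together, the first file's change follows a common argparse pattern: a boolean dataclass field, a `store_true` CLI flag, and a guard around the warmup call. A minimal self-contained sketch of that pattern (the names `Args` and `run_once` are illustrative, not the benchmark's real API):

```python
import argparse
import logging
from dataclasses import dataclass


@dataclass
class Args:
    # Mirrors the new BenchArgs field: warmup runs unless the flag is passed.
    skip_warmup: bool = False


def run_once(label: str) -> None:
    # Stand-in for throughput_test_once in the real script.
    logging.info("running %s batch", label)


def main() -> None:
    parser = argparse.ArgumentParser()
    # action="store_true" defaults to False and flips to True when the flag is given.
    parser.add_argument("--skip-warmup", action="store_true", help="Skip the warmup batches.")
    args = Args(**vars(parser.parse_args()))

    if not args.skip_warmup:
        run_once("warmup")  # skipped entirely when --skip-warmup is passed
    run_once("benchmark")  # the measured run always executes


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()
```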

@@ -156,9 +156,6 @@ class TpModelWorkerClient:
         return logits_output, next_token_ids
 
     def forward_batch_generation(self, model_worker_batch: ModelWorkerBatch):
-        # A cuda stream sync here to avoid the cuda illegal memory access error.
-        torch.cuda.current_stream().synchronize()
-
         # Create a new copy of sampling_info because it will be updated in-place by the scheduler for the next batch.
         sampling_info = model_worker_batch.sampling_info
         sampling_info.update_penalties()
@@ -169,6 +166,9 @@ class TpModelWorkerClient:
             linear_penalties=sampling_info.linear_penalties,
         )
+
+        # A cuda stream sync here to avoid the cuda illegal memory access error.
+        torch.cuda.current_stream().synchronize()
 
         # Push a new batch to the queue
         self.input_queue.put((model_worker_batch, self.future_token_ids_ct))
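
The second file moves the stream synchronization from the top of `forward_batch_generation` to just before the batch is pushed onto the worker's input queue, presumably so the sync also covers the penalty-tensor work done above it. A minimal sketch of that producer-side pattern, assuming a generic producer/consumer setup rather than SGLang's actual worker classes:

```python
import queue
import threading

import torch


def producer(batch: dict, work_queue: queue.Queue) -> None:
    # Tensors in `batch` were just written by kernels/copies on the current stream.
    if torch.cuda.is_available():
        # Block until everything issued on the current stream has finished, so the
        # consumer thread never reads half-written GPU memory (one common cause of
        # "illegal memory access" errors).
        torch.cuda.current_stream().synchronize()
    work_queue.put(batch)


def consumer(work_queue: queue.Queue) -> None:
    batch = work_queue.get()
    # Safe to read: the producer synchronized before publishing the batch.
    total = sum(v.sum().item() for v in batch.values() if torch.is_tensor(v))
    print("consumed batch, checksum:", total)


if __name__ == "__main__":
    q: queue.Queue = queue.Queue()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    t = threading.Thread(target=consumer, args=(q,))
    t.start()
    producer({"ids": torch.ones(4, device=device)}, q)
    t.join()
```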