From 35759efa91f39168c6aa255fd9b14fd50aea968b Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sat, 20 Jul 2024 01:06:43 -0700
Subject: [PATCH] Support random dataset in bench_serving.py (#669)

---
 python/sglang/bench_serving.py                | 89 +++++++++++++++++---
 .../srt/managers/controller/model_runner.py   |  2 +-
 python/sglang/srt/managers/io_struct.py       |  2 +-
 .../sglang/srt/managers/tokenizer_manager.py  |  8 +-
 4 files changed, 84 insertions(+), 17 deletions(-)

diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
index 870cca543..c872ec2b6 100644
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -273,6 +273,37 @@ def sample_sharegpt_requests(
     return filtered_dataset
 
 
+def sample_random_requests(
+    input_len: int,
+    output_len: int,
+    num_prompts: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, int, int]]:
+
+    input_lens = np.random.randint(
+        int(input_len * range_ratio),
+        input_len + 1,
+        size=num_prompts,
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio),
+        output_len + 1,
+        size=num_prompts,
+    )
+    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+    input_requests = []
+    for i in range(num_prompts):
+        prompt = tokenizer.decode(
+            [(offsets[i] + i + j) % tokenizer.vocab_size for j in range(input_lens[i])]
+        )
+        input_requests.append((prompt, int(input_lens[i]), int(output_lens[i])))
+
+    print(f"#Input tokens: {np.sum(input_lens)}")
+    print(f"#Output tokens: {np.sum(output_lens)}")
+    return input_requests
+
+
 async def get_request(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
@@ -530,13 +561,23 @@ def fire(args: argparse.Namespace):
 
     tokenizer = get_tokenizer(tokenizer_id)
 
-    assert args.dataset is not None
-    input_requests = sample_sharegpt_requests(
-        dataset_path=args.dataset,
-        num_requests=args.num_prompts,
-        tokenizer=tokenizer,
-        fixed_output_len=args.sharegpt_output_len,
-    )
+    if args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+        )
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
 
     asyncio.run(
         benchmark(
@@ -589,7 +630,14 @@ if __name__ == "__main__":
         help="If not set, the default port is configured according to its default value for different LLM Inference Engines.",
     )
     parser.add_argument(
-        "--dataset", type=str, default="sharegpt", help="Path to the ShareGPT dataset"
+        "--dataset-name",
+        type=str,
+        default="sharegpt",
+        choices=["sharegpt", "random"],
+        help="Name of the dataset to benchmark on.",
+    )
+    parser.add_argument(
+        "--dataset-path", type=str, default="", help="Path to the dataset."
     )
     parser.add_argument(
         "--model",
@@ -613,10 +661,29 @@ if __name__ == "__main__":
         default=None,
         help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
     )
+    parser.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help="Number of input tokens per request, used only for random dataset.",
+    )
+    parser.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help="Number of output tokens per request, used only for random dataset.",
+    )
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random dataset.",
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
-        default=128.0,
-        help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
-        "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.",
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
+        "Otherwise, we use a Poisson process to synthesize the request arrival times.",
     )
diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py
index dd576f7a6..01450d8ac 100644
--- a/python/sglang/srt/managers/controller/model_runner.py
+++ b/python/sglang/srt/managers/controller/model_runner.py
@@ -233,7 +233,7 @@ class ModelRunner:
             return
 
         logger.info(f"[gpu_id={self.gpu_id}] Capture cuda graph begin.")
-        batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 16)]
+        batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)]
         self.cuda_graph_runner = CudaGraphRunner(
             self, max_batch_size_to_capture=max(batch_size_list)
         )
diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py
index 2ba2095c7..9638e12ca 100644
--- a/python/sglang/srt/managers/io_struct.py
+++ b/python/sglang/srt/managers/io_struct.py
@@ -40,7 +40,7 @@ class GenerateReqInput:
             self.text is not None and self.input_ids is not None
         ):
             raise ValueError("Either text or input_ids should be provided.")
-        if "n" in self.sampling_params and self.sampling_params["n"] != 1:
+        if self.sampling_params.get("n", 1) != 1:
             is_single = False
         else:
             if self.text is not None:
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index 0eabb480a..f6cc8677c 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -196,14 +196,14 @@ class TokenizerManager:
         event = asyncio.Event()
         state = ReqState([], False, event)
         self.rid_to_state[rid] = state
-        if is_prefill == False:
+        if is_prefill:
+            await self._wait_for_prefill_response(event, state, obj, request, rid)
+            yield input_ids
+        else:
             async for response in self._wait_for_response(
                 event, state, obj, rid, request
             ):
                 yield response
-        else:
-            await self._wait_for_prefill_response(event, state, obj, request, rid)
-            yield input_ids
 
     async def _handle_batch_request(self, obj, request):
         batch_size = obj.batch_size
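
Usage note (illustrative, not part of the patch): a sketch of how the new random
dataset mode might be invoked. All flags are defined in or referenced by this
diff; the `python3 -m sglang.bench_serving` entry point is an assumption
inferred from the file path python/sglang/bench_serving.py.

    # Sketch: benchmark with 1000 synthetic prompts. With --random-range-ratio
    # 0.5, input lengths are drawn uniformly from [512, 1024] tokens and output
    # lengths from [64, 128] tokens. A finite --request-rate makes arrivals
    # follow a Poisson process; omit it to keep the new default (inf), which
    # sends all requests at time 0.
    python3 -m sglang.bench_serving \
        --dataset-name random \
        --num-prompts 1000 \
        --random-input-len 1024 \
        --random-output-len 128 \
        --random-range-ratio 0.5 \
        --request-rate 8

Note the prompt construction in sample_random_requests: each prompt is decoded
from a rotating window of token ids, (offsets[i] + i + j) % vocab_size, so the
requests carry distinct token sequences rather than identical repeated
prefixes.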