diff --git a/benchmark/latency_throughput/README.md b/benchmark/latency_throughput/README.md
index af136e1d6..cabbb6ece 100644
--- a/benchmark/latency_throughput/README.md
+++ b/benchmark/latency_throughput/README.md
@@ -30,7 +30,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 #### Run ShareGPT
 
 ```
-python3 bench_throughput.py --backend srt --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
+python3 bench_serving.py --backend srt --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
 ```
 
 ## Other baselines
@@ -42,14 +42,20 @@ python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --t
 ```
 
 ```
 # run synthetic
-python3 bench_throughput.py --backend vllm --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256
+python3 bench_serving.py --backend vllm --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256
 ```
 
 ```
 # run ShareGPT
-python3 bench_throughput.py --backend vllm --port 21000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
+python3 bench_serving.py --backend vllm --port 21000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
 ```
 
+```
+# run one batch
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B --tensor 8 --disable-log-requests --max-num-seqs 1024 --quantization fp8
+
+python3 bench_one.py --input-len 1024 --batch-size 1 1 2 4 8 16 32 64 128 256 512 768 1024 --port 8000 --backend vllm
+```
 ### LightLLM
 ```
@@ -57,5 +63,5 @@ python -m lightllm.server.api_server --model_dir ~/model_weights/Llama-2-7b-chat
 ```
 
 ```
-python3 bench_throughput.py --backend lightllm --port 22000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
+python3 bench_serving.py --backend lightllm --port 22000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
 ```
\ No newline at end of file
diff --git a/benchmark/latency_throughput/bench_one.py b/benchmark/latency_throughput/bench_one.py
index cb3ec5a4e..b51508d3e 100644
--- a/benchmark/latency_throughput/bench_one.py
+++ b/benchmark/latency_throughput/bench_one.py
@@ -15,19 +15,19 @@ def run_one_batch_size(bs):
     url = f"{args.host}:{args.port}"
     max_new_tokens = args.max_tokens
 
-    a = 20
-    prompt = f"{a, }"
+    if args.input_len:
+        input_ids = [
+            [int(x) for x in np.random.randint(0, high=16384, size=(args.input_len,))] for _ in range(bs)
+        ]
+    else:
+        text = [f"{i, }" for i in range(bs)]
 
     tic = time.time()
     if args.backend == "srt":
         if args.input_len:
-            inputs = {"input_ids": [
-                [int(x) for x in np.random.randint(0, high=16384, size=(args.input_len,))] for _ in range(bs)
-            ]}
+            inputs = {"input_ids": input_ids}
         else:
-            inputs = {"text": [
-                f"{i, }" for i in range(bs)
-            ]}
+            inputs = {"text": text}
 
         response = requests.post(
             url + "/generate",
@@ -44,7 +44,7 @@ def run_one_batch_size(bs):
         response = requests.post(
             url + "/generate",
             json={
-                "inputs": prompt,
+                "inputs": text[0],
                 "parameters": {
                     "temperature": 0,
                     "max_new_tokens": max_new_tokens,
@@ -53,13 +53,19 @@ def run_one_batch_size(bs):
             },
         )
     elif args.backend == "vllm":
+        if args.input_len:
+            inputs = {"prompt": input_ids}
+        else:
+            inputs = {"prompt": text}
+
         response = requests.post(
-            url + "/generate",
+            url + "/v1/completions",
            json={
-                "prompt": prompt,
+                "model": args.vllm_model_name,
                 "temperature": 0,
                 "max_tokens": max_new_tokens,
                 "ignore_eos": True,
+                **inputs,
             },
         )
     elif args.backend == "ginfer":
@@ -71,7 +77,7 @@ def run_one_batch_size(bs):
 
         tic = time.time()
         sample_request = sampler_pb2.SampleTextRequest(
-            prompt=prompt,
+            prompt=text[0],
             settings=sampler_pb2.SampleSettings(
                 max_len=max_new_tokens,
                 rng_seed=0,
@@ -92,7 +98,7 @@ def run_one_batch_size(bs):
     output_throughput = bs * max_new_tokens / latency
     print(f"latency: {latency:.2f} s, speed: {output_throughput:.2f} token/s")
 
-    with open("tmp_output.txt", "a") as fout:
+    with open("results.jsonl", "a") as fout:
         res = {
             "input_len": args.input_len,
             "output_len": args.max_tokens,
@@ -111,6 +117,7 @@ if __name__ == "__main__":
     parser.add_argument("--input-len", type=int, default=None)
     parser.add_argument("--batch-size", type=int, nargs='*', default=[1])
     parser.add_argument("--max-tokens", type=int, default=256)
+    parser.add_argument("--vllm-model-name", type=str, default="meta-llama/Meta-Llama-3-70B")
     args = parser.parse_args()
 
     if args.port is None:
diff --git a/python/sglang/README.md b/python/sglang/README.md
new file mode 100644
index 000000000..c8c093706
--- /dev/null
+++ b/python/sglang/README.md
@@ -0,0 +1,12 @@
+# Code Structure
+
+- `backend`: Various backends for the language interpreter.
+- `lang`: The frontend language.
+- `srt`: The runtime for running local models.
+- `test`: Test utilities.
+- `api.py`: Public API.
+- `bench_latency.py`: Benchmark utilities.
+- `global_config.py`: The global configs and constants.
+- `launch_server.py`: The entry point of launching local server.
+- `utils.py`: Common utilities.
+
diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py
index d7b66e76b..a24653661 100644
--- a/python/sglang/srt/managers/controller/model_runner.py
+++ b/python/sglang/srt/managers/controller/model_runner.py
@@ -276,17 +276,13 @@ class ModelRunner:
         input_metadata = InputMetadata.create(
             self,
             forward_mode=ForwardMode.EXTEND,
-            tp_size=self.tp_size,
             req_pool_indices=batch.req_pool_indices,
             seq_lens=batch.seq_lens,
             prefix_lens=batch.prefix_lens,
             position_ids_offsets=batch.position_ids_offsets,
             out_cache_loc=batch.out_cache_loc,
-            top_logprobs_nums=batch.top_logprobs_nums,
             return_logprob=batch.return_logprob,
-            flashinfer_prefill_wrapper_ragged=self.flashinfer_prefill_wrapper_ragged,
-            flashinfer_prefill_wrapper_paged=self.flashinfer_prefill_wrapper_paged,
-            flashinfer_decode_wrapper=self.flashinfer_decode_wrapper,
+            top_logprobs_nums=batch.top_logprobs_nums,
         )
         return self.model.forward(
             batch.input_ids,
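For context, below is a minimal standalone sketch of the request that the updated vLLM path in `bench_one.py` issues: one batched call to the OpenAI-compatible `/v1/completions` endpoint with `ignore_eos` enabled, so each prompt generates exactly `max_tokens` tokens. The server address, model name, batch size, and token count are assumptions taken from the README example above, not part of the diff.

```python
# Sketch only: assumes a vLLM OpenAI-compatible server is already running, e.g.
#   python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B --port 8000
# The model name must match the served model (bench_one.py takes it via --vllm-model-name).
import time

import requests

url = "http://localhost:8000"            # assumed host/port
model = "meta-llama/Meta-Llama-3-70B"    # assumed model name
bs = 8                                   # assumed batch size
max_new_tokens = 256

prompts = [f"{i, }" for i in range(bs)]  # one short prompt per request, as in bench_one.py

tic = time.time()
response = requests.post(
    url + "/v1/completions",
    json={
        "model": model,
        "prompt": prompts,               # the completions API accepts a list of prompts
        "temperature": 0,
        "max_tokens": max_new_tokens,
        "ignore_eos": True,              # force exactly max_tokens output tokens per prompt
    },
)
latency = time.time() - tic

output_throughput = bs * max_new_tokens / latency
print(f"latency: {latency:.2f} s, speed: {output_throughput:.2f} token/s")
```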