Improve benchmark scripts & fix llava (#613)
This commit is contained in:
@@ -30,7 +30,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
|
|||||||
|
|
||||||
#### Run ShareGPT
|
#### Run ShareGPT
|
||||||
```
|
```
|
||||||
python3 bench_throughput.py --backend srt --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
|
python3 bench_serving.py --backend srt --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
|
||||||
```
|
```
|
||||||
|
|
||||||
## Other baselines
|
## Other baselines
|
||||||
@@ -42,14 +42,20 @@ python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --t
|
|||||||
|
|
||||||
```
|
```
|
||||||
# run synthetic
|
# run synthetic
|
||||||
python3 bench_throughput.py --backend vllm --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256
|
python3 bench_serving.py --backend vllm --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompts 1000 --request-rate 100 --input-len 1024 --output-len 256
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
# run ShareGPT
|
# run ShareGPT
|
||||||
python3 bench_throughput.py --backend vllm --port 21000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
|
python3 bench_serving.py --backend vllm --port 21000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
# run one batch
|
||||||
|
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B --tensor 8 --disable-log-requests --max-num-seqs 1024 --quantization fp8
|
||||||
|
|
||||||
|
python3 bench_one.py --input-len 1024 --batch-size 1 1 2 4 8 16 32 64 128 256 512 768 1024 --port 8000 --backend vllm
|
||||||
|
```
|
||||||
|
|
||||||
### LightLLM
|
### LightLLM
|
||||||
```
|
```
|
||||||
@@ -57,5 +63,5 @@ python -m lightllm.server.api_server --model_dir ~/model_weights/Llama-2-7b-chat
|
|||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
python3 bench_throughput.py --backend lightllm --port 22000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
|
python3 bench_serving.py --backend lightllm --port 22000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
|
||||||
```
|
```
|
||||||
@@ -15,19 +15,19 @@ def run_one_batch_size(bs):
|
|||||||
url = f"{args.host}:{args.port}"
|
url = f"{args.host}:{args.port}"
|
||||||
max_new_tokens = args.max_tokens
|
max_new_tokens = args.max_tokens
|
||||||
|
|
||||||
a = 20
|
if args.input_len:
|
||||||
prompt = f"{a, }"
|
input_ids = [
|
||||||
|
[int(x) for x in np.random.randint(0, high=16384, size=(args.input_len,))] for _ in range(bs)
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
text = [f"{i, }" for i in range(bs)]
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.time()
|
||||||
if args.backend == "srt":
|
if args.backend == "srt":
|
||||||
if args.input_len:
|
if args.input_len:
|
||||||
inputs = {"input_ids": [
|
inputs = {"input_ids": input_ids}
|
||||||
[int(x) for x in np.random.randint(0, high=16384, size=(args.input_len,))] for _ in range(bs)
|
|
||||||
]}
|
|
||||||
else:
|
else:
|
||||||
inputs = {"text": [
|
inputs = {"text": text}
|
||||||
f"{i, }" for i in range(bs)
|
|
||||||
]}
|
|
||||||
|
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
url + "/generate",
|
url + "/generate",
|
||||||
@@ -44,7 +44,7 @@ def run_one_batch_size(bs):
|
|||||||
response = requests.post(
|
response = requests.post(
|
||||||
url + "/generate",
|
url + "/generate",
|
||||||
json={
|
json={
|
||||||
"inputs": prompt,
|
"inputs": text[0],
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"temperature": 0,
|
"temperature": 0,
|
||||||
"max_new_tokens": max_new_tokens,
|
"max_new_tokens": max_new_tokens,
|
||||||
@@ -53,13 +53,19 @@ def run_one_batch_size(bs):
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
elif args.backend == "vllm":
|
elif args.backend == "vllm":
|
||||||
|
if args.input_len:
|
||||||
|
inputs = {"prompt": input_ids}
|
||||||
|
else:
|
||||||
|
inputs = {"prompt": text}
|
||||||
|
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
url + "/generate",
|
url + "/v1/completions",
|
||||||
json={
|
json={
|
||||||
"prompt": prompt,
|
"model": args.vllm_model_name,
|
||||||
"temperature": 0,
|
"temperature": 0,
|
||||||
"max_tokens": max_new_tokens,
|
"max_tokens": max_new_tokens,
|
||||||
"ignore_eos": True,
|
"ignore_eos": True,
|
||||||
|
**inputs,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
elif args.backend == "ginfer":
|
elif args.backend == "ginfer":
|
||||||
@@ -71,7 +77,7 @@ def run_one_batch_size(bs):
|
|||||||
|
|
||||||
tic = time.time()
|
tic = time.time()
|
||||||
sample_request = sampler_pb2.SampleTextRequest(
|
sample_request = sampler_pb2.SampleTextRequest(
|
||||||
prompt=prompt,
|
prompt=text[0],
|
||||||
settings=sampler_pb2.SampleSettings(
|
settings=sampler_pb2.SampleSettings(
|
||||||
max_len=max_new_tokens,
|
max_len=max_new_tokens,
|
||||||
rng_seed=0,
|
rng_seed=0,
|
||||||
@@ -92,7 +98,7 @@ def run_one_batch_size(bs):
|
|||||||
output_throughput = bs * max_new_tokens / latency
|
output_throughput = bs * max_new_tokens / latency
|
||||||
print(f"latency: {latency:.2f} s, speed: {output_throughput:.2f} token/s")
|
print(f"latency: {latency:.2f} s, speed: {output_throughput:.2f} token/s")
|
||||||
|
|
||||||
with open("tmp_output.txt", "a") as fout:
|
with open("results.jsonl", "a") as fout:
|
||||||
res = {
|
res = {
|
||||||
"input_len": args.input_len,
|
"input_len": args.input_len,
|
||||||
"output_len": args.max_tokens,
|
"output_len": args.max_tokens,
|
||||||
@@ -111,6 +117,7 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument("--input-len", type=int, default=None)
|
parser.add_argument("--input-len", type=int, default=None)
|
||||||
parser.add_argument("--batch-size", type=int, nargs='*', default=[1])
|
parser.add_argument("--batch-size", type=int, nargs='*', default=[1])
|
||||||
parser.add_argument("--max-tokens", type=int, default=256)
|
parser.add_argument("--max-tokens", type=int, default=256)
|
||||||
|
parser.add_argument("--vllm-model-name", type=str, default="meta-llama/Meta-Llama-3-70B")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.port is None:
|
if args.port is None:
|
||||||
|
|||||||
12
python/sglang/README.md
Normal file
12
python/sglang/README.md
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Code Structure
|
||||||
|
|
||||||
|
- `backend`: Various backends for the language interpreter.
|
||||||
|
- `lang`: The frontend language.
|
||||||
|
- `srt`: The runtime for running local models.
|
||||||
|
- `test`: Test utilities.
|
||||||
|
- `api.py`: Public API.
|
||||||
|
- `bench_latency.py`: Benchmark utilities.
|
||||||
|
- `global_config.py`: The global configs and constants.
|
||||||
|
- `launch_server.py`: The entry point for launching a local server.
|
||||||
|
- `utils.py`: Common utilities.
|
||||||
|
|
||||||
@@ -276,17 +276,13 @@ class ModelRunner:
|
|||||||
input_metadata = InputMetadata.create(
|
input_metadata = InputMetadata.create(
|
||||||
self,
|
self,
|
||||||
forward_mode=ForwardMode.EXTEND,
|
forward_mode=ForwardMode.EXTEND,
|
||||||
tp_size=self.tp_size,
|
|
||||||
req_pool_indices=batch.req_pool_indices,
|
req_pool_indices=batch.req_pool_indices,
|
||||||
seq_lens=batch.seq_lens,
|
seq_lens=batch.seq_lens,
|
||||||
prefix_lens=batch.prefix_lens,
|
prefix_lens=batch.prefix_lens,
|
||||||
position_ids_offsets=batch.position_ids_offsets,
|
position_ids_offsets=batch.position_ids_offsets,
|
||||||
out_cache_loc=batch.out_cache_loc,
|
out_cache_loc=batch.out_cache_loc,
|
||||||
top_logprobs_nums=batch.top_logprobs_nums,
|
|
||||||
return_logprob=batch.return_logprob,
|
return_logprob=batch.return_logprob,
|
||||||
flashinfer_prefill_wrapper_ragged=self.flashinfer_prefill_wrapper_ragged,
|
top_logprobs_nums=batch.top_logprobs_nums,
|
||||||
flashinfer_prefill_wrapper_paged=self.flashinfer_prefill_wrapper_paged,
|
|
||||||
flashinfer_decode_wrapper=self.flashinfer_decode_wrapper,
|
|
||||||
)
|
)
|
||||||
return self.model.forward(
|
return self.model.forward(
|
||||||
batch.input_ids,
|
batch.input_ids,
|
||||||
|
|||||||
Reference in New Issue
Block a user