### What this PR does / why we need it?
The latest vLLM commit has deprecated the `--endpoint-type` argument,
so the benchmark configuration should use `--backend` instead.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
Tested locally:
```shell
export VLLM_USE_MODELSCOPE=true
export DATASET_PATH=/root/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
vllm serve Qwen/Qwen2.5-7B-Instruct --load-format dummy
wget -O ${DATASET_PATH} https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench serve --model Qwen/Qwen2.5-7B-Instruct --backend vllm --dataset-name sharegpt --dataset-path ${DATASET_PATH} --num-prompts 200
```
and the result looks good:
```shell
============ Serving Benchmark Result ============
Successful requests: 200
Benchmark duration (s): 20.36
Total input tokens: 43560
Total generated tokens: 44697
Request throughput (req/s): 9.82
Output token throughput (tok/s): 2194.88
Peak output token throughput (tok/s): 4676.00
Peak concurrent requests: 200.00
Total Token throughput (tok/s): 4333.93
---------------Time to First Token----------------
Mean TTFT (ms): 2143.85
Median TTFT (ms): 2486.17
P99 TTFT (ms): 2530.36
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 43.50
Median TPOT (ms): 30.75
P99 TPOT (ms): 309.22
---------------Inter-token Latency----------------
Mean ITL (ms): 28.15
Median ITL (ms): 25.42
P99 ITL (ms): 38.30
==================================================
```
- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0
Signed-off-by: wangli <wangli858794774@gmail.com>
79 lines
1.8 KiB
JSON
79 lines
1.8 KiB
JSON
[
    {
        "test_name": "serving_qwen2_5vl_7B_tp1",
        "qps_list": [
            1,
            4,
            16,
            "inf"
        ],
        "server_parameters": {
            "model": "Qwen/Qwen2.5-VL-7B-Instruct",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "trust_remote_code": "",
            "max_model_len": 16384
        },
        "client_parameters": {
            "model": "Qwen/Qwen2.5-VL-7B-Instruct",
            "backend": "openai-chat",
            "dataset_name": "hf",
            "hf_split": "train",
            "endpoint": "/v1/chat/completions",
            "dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
            "num_prompts": 200,
            "no_stream": ""
        }
    },
    {
        "test_name": "serving_qwen3_8B_tp1",
        "qps_list": [
            1,
            4,
            16,
            "inf"
        ],
        "server_parameters": {
            "model": "Qwen/Qwen3-8B",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "Qwen/Qwen3-8B",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_qwen2_5_7B_tp1",
        "qps_list": [
            1,
            4,
            16,
            "inf"
        ],
        "server_parameters": {
            "model": "Qwen/Qwen2.5-7B-Instruct",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "Qwen/Qwen2.5-7B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    }
]