From 802c574532bf4cc6d46b6ee508e7d0f2b6e023a5 Mon Sep 17 00:00:00 2001
From: Li Wang
Date: Fri, 24 Oct 2025 11:18:19 +0800
Subject: [PATCH] [Benchmark] Upgrade benchmark args for new vllm version
 (#3218)

### What this PR does / why we need it?
The latest vLLM has deprecated the `--endpoint-type` argument, so we should use `--backend` instead.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
Tested locally:

```shell
export VLLM_USE_MODELSCOPE=true
export DATASET_PATH=/root/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json

vllm serve Qwen/Qwen2.5-7B-Instruct --load-format dummy
wget -O ${DATASET_PATH} https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench serve --model Qwen/Qwen2.5-7B-Instruct --backend vllm --dataset-name sharegpt --dataset-path ${DATASET_PATH} --num-prompts 200
```

and the result looks good:

```shell
============ Serving Benchmark Result ============
Successful requests:                     200
Benchmark duration (s):                  20.36
Total input tokens:                      43560
Total generated tokens:                  44697
Request throughput (req/s):              9.82
Output token throughput (tok/s):         2194.88
Peak output token throughput (tok/s):    4676.00
Peak concurrent requests:                200.00
Total Token throughput (tok/s):          4333.93
---------------Time to First Token----------------
Mean TTFT (ms):                          2143.85
Median TTFT (ms):                        2486.17
P99 TTFT (ms):                           2530.36
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          43.50
Median TPOT (ms):                        30.75
P99 TPOT (ms):                           309.22
---------------Inter-token Latency----------------
Mean ITL (ms):                           28.15
Median ITL (ms):                         25.42
P99 ITL (ms):                            38.30
==================================================
```

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: wangli
---
 benchmarks/tests/serving-tests.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json
index c2be9eb0..8eface3d 100644
--- a/benchmarks/tests/serving-tests.json
+++ b/benchmarks/tests/serving-tests.json
@@ -18,7 +18,7 @@
         },
         "client_parameters": {
             "model": "Qwen/Qwen2.5-VL-7B-Instruct",
-            "endpoint_type": "openai-chat",
+            "backend": "openai-chat",
             "dataset_name": "hf",
             "hf_split": "train",
             "endpoint": "/v1/chat/completions",
@@ -45,7 +45,7 @@
         },
         "client_parameters": {
             "model": "Qwen/Qwen3-8B",
-            "endpoint_type": "vllm",
+            "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
@@ -69,7 +69,7 @@
         },
         "client_parameters": {
             "model": "Qwen/Qwen2.5-7B-Instruct",
-            "endpoint_type": "vllm",
+            "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
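
As a quick sanity check of the renamed key, here is a minimal sketch of how the updated `client_parameters` entries map onto a `vllm bench serve` invocation. This is illustrative only: `build_bench_cmd`, the underscore-to-dash flag conversion, and the assumption that the file is a JSON array of test entries are all assumptions, not the repo's actual benchmark runner.

```python
import json

def build_bench_cmd(client_parameters: dict) -> list[str]:
    # Hypothetical helper: turns a "client_parameters" block from
    # benchmarks/tests/serving-tests.json into a `vllm bench serve`
    # command line. The underscore-to-dash mapping mirrors the keys
    # shown in the diff (e.g. "dataset_name" -> --dataset-name,
    # "num_prompts" -> --num-prompts).
    cmd = ["vllm", "bench", "serve"]
    for key, value in client_parameters.items():
        cmd += [f"--{key.replace('_', '-')}", str(value)]
    return cmd

with open("benchmarks/tests/serving-tests.json") as f:
    tests = json.load(f)  # assumed to be a JSON array of test entries

for test in tests:
    params = test["client_parameters"]
    # After this patch the key is "backend" (previously "endpoint_type").
    assert "endpoint_type" not in params, "stale key: use 'backend' instead"
    print(" ".join(build_bench_cmd(params)))
```

Printing the commands this way makes it easy to confirm each config now produces a `--backend` flag that the new vLLM CLI accepts.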