From 802c574532bf4cc6d46b6ee508e7d0f2b6e023a5 Mon Sep 17 00:00:00 2001
From: Li Wang
Date: Fri, 24 Oct 2025 11:18:19 +0800
Subject: [PATCH] [Benchmark] Upgrade benchmark args for new vllm version
 (#3218)

### What this PR does / why we need it?
The latest vLLM has deprecated the `--endpoint-type` argument, so we should use `--backend` instead.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
Tested locally:

```shell
export VLLM_USE_MODELSCOPE=true
export DATASET_PATH=/root/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json

vllm serve Qwen/Qwen2.5-7B-Instruct --load-format dummy
wget -O ${DATASET_PATH} https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench serve --model Qwen/Qwen2.5-7B-Instruct --backend vllm --dataset-name sharegpt --dataset-path ${DATASET_PATH} --num-prompts 200
```

and the result looks good:

```shell
============ Serving Benchmark Result ============
Successful requests:                     200
Benchmark duration (s):                  20.36
Total input tokens:                      43560
Total generated tokens:                  44697
Request throughput (req/s):              9.82
Output token throughput (tok/s):         2194.88
Peak output token throughput (tok/s):    4676.00
Peak concurrent requests:                200.00
Total Token throughput (tok/s):          4333.93
---------------Time to First Token----------------
Mean TTFT (ms):                          2143.85
Median TTFT (ms):                        2486.17
P99 TTFT (ms):                           2530.36
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          43.50
Median TPOT (ms):                        30.75
P99 TPOT (ms):                           309.22
---------------Inter-token Latency----------------
Mean ITL (ms):                           28.15
Median ITL (ms):                         25.42
P99 ITL (ms):                            38.30
==================================================
```

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: wangli
---
 benchmarks/tests/serving-tests.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json
index c2be9eb0..8eface3d 100644
--- a/benchmarks/tests/serving-tests.json
+++ b/benchmarks/tests/serving-tests.json
@@ -18,7 +18,7 @@
         },
         "client_parameters": {
             "model": "Qwen/Qwen2.5-VL-7B-Instruct",
-            "endpoint_type": "openai-chat",
+            "backend": "openai-chat",
             "dataset_name": "hf",
             "hf_split": "train",
             "endpoint": "/v1/chat/completions",
@@ -45,7 +45,7 @@
         },
         "client_parameters": {
             "model": "Qwen/Qwen3-8B",
-            "endpoint_type": "vllm",
+            "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
@@ -69,7 +69,7 @@
         },
         "client_parameters": {
             "model": "Qwen/Qwen2.5-7B-Instruct",
-            "endpoint_type": "vllm",
+            "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
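
As a quick sanity check of the renamed key, here is a minimal sketch of how the updated `client_parameters` entries map onto a `vllm bench serve` invocation. This is illustrative only: `build_bench_cmd`, the underscore-to-dash flag conversion, and the assumption that the file is a JSON array of test entries are all assumptions, not the repo's actual benchmark runner.

```python
import json

def build_bench_cmd(client_parameters: dict) -> list[str]:
    # Hypothetical helper: turns a "client_parameters" block from
    # benchmarks/tests/serving-tests.json into a `vllm bench serve`
    # command line. The underscore-to-dash mapping mirrors the keys
    # shown in the diff (e.g. "dataset_name" -> --dataset-name,
    # "num_prompts" -> --num-prompts).
    cmd = ["vllm", "bench", "serve"]
    for key, value in client_parameters.items():
        cmd += [f"--{key.replace('_', '-')}", str(value)]
    return cmd

with open("benchmarks/tests/serving-tests.json") as f:
    tests = json.load(f)  # assumed to be a JSON array of test entries

for test in tests:
    params = test["client_parameters"]
    # After this patch the key is "backend" (previously "endpoint_type").
    assert "endpoint_type" not in params, "stale key: use 'backend' instead"
    print(" ".join(build_bench_cmd(params)))
```

Printing the commands this way makes it easy to confirm each config now produces a `--backend` flag that the new vLLM CLI accepts.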