2025-12-27 09:16:08 +08:00
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
import logging
import os
import subprocess
from datetime import datetime
from . aisbench import maybe_download_from_modelscope
class VllmbenchRunner :
def _run_vllm_bench_task ( self ) :
vllm_bench_cmd = [
' vllm ' , ' bench ' , ' serve ' , ' --backend ' , ' openai-chat ' ,
' --trust-remote-code ' , ' --served-model-name ' ,
str ( self . model_name ) , ' --model ' , self . model_path , ' --tokenizer ' ,
self . model_path , ' --metric-percentiles ' , ' 50,90,99 ' , ' --host ' ,
self . host_ip , ' --port ' ,
str ( self . port ) , ' --save-result ' , ' --result-filename ' ,
self . result_filename , ' --endpoint ' , ' /v1/chat/completions ' ,
' --ready-check-timeout-sec ' , ' 0 '
]
self . _concat_config_args ( vllm_bench_cmd )
print ( f " running vllm_bench cmd: { ' ' . join ( vllm_bench_cmd ) } " )
self . proc : subprocess . Popen = subprocess . Popen ( vllm_bench_cmd ,
stdout = subprocess . PIPE ,
stderr = subprocess . PIPE ,
text = True )
def __init__ ( self ,
model_name : str ,
port : int ,
config : dict ,
2026-01-07 18:42:05 +08:00
baseline : float ,
threshold : float = 0.97 ,
2025-12-27 09:16:08 +08:00
model_path : str = " " ,
host_ip : str = " localhost " ) :
self . model_name = model_name
self . model_path = model_path
if not self . model_path :
self . model_path = maybe_download_from_modelscope ( model_name )
assert self . model_path is not None , \
f " Failed to download model: model= { self . model_path } "
self . port = port
self . host_ip = host_ip
curr_time = datetime . now ( ) . strftime ( ' % Y % m %d % H % M % S ' )
self . result_filename = f " result_vllm_bench_ { curr_time } .json "
self . config = config
2026-01-07 18:42:05 +08:00
self . baseline = baseline
self . threshold = threshold
2025-12-27 09:16:08 +08:00
self . _run_vllm_bench_task ( )
self . _wait_for_task ( )
2026-01-07 18:42:05 +08:00
self . _performance_verify ( )
2025-12-27 09:16:08 +08:00
def _concat_config_args ( self , vllm_bench_cmd ) :
if " ignore_eos " in self . config :
if self . config [ " ignore_eos " ] :
self . config [ " ignore_eos " ] = " "
else :
self . config . pop ( " ignore_eos " )
for key , value in self . config . items ( ) :
key = " -- " + key . replace ( " _ " , " - " )
vllm_bench_cmd + = [ key , str ( value ) ]
def __enter__ ( self ) :
return self
def __exit__ ( self , exc_type , exc_value , traceback ) :
self . proc . terminate ( )
try :
self . proc . wait ( 8 )
except subprocess . TimeoutExpired :
# force kill if needed
self . proc . kill ( )
def _wait_for_task ( self ) :
2026-01-07 18:42:05 +08:00
""" Wait for the vllm bench command to complete and check the execution result """
stdout , stderr = self . proc . communicate ( )
if self . proc . returncode != 0 :
logging . error (
f " vllm bench command failed, return code: { self . proc . returncode } "
)
logging . error ( f " Standard output: { stdout } " )
logging . error ( f " Standard error: { stderr } " )
raise RuntimeError (
f " vllm bench command execution failed: { stderr } " )
logging . info (
f " vllm bench command completed, return code: { self . proc . returncode } "
)
if stdout :
lines = stdout . split ( ' \n ' )
last_lines = lines [ - 100 : ] if len ( lines ) > 100 else lines
logging . info ( f " Last { len ( last_lines ) } lines of standard output: " )
for line in last_lines :
logging . info ( line )
else :
logging . info ( " Standard output is empty " )
2025-12-27 09:16:08 +08:00
def _get_result ( self ) :
result_file = os . path . join ( os . getcwd ( ) , self . result_filename )
print ( " Getting performance results from file: " , result_file )
with open ( result_file , ' r ' , encoding = ' utf-8 ' ) as f :
self . result = json . load ( f )
2026-01-07 18:42:05 +08:00
def _performance_verify ( self ) :
self . _get_result ( )
output_throughput = self . result [ " output_throughput " ]
assert float (
output_throughput
) > = self . baseline * self . threshold , f " Performance verification failed. The current Output Token Throughput is { output_throughput } token/s, which is not greater than or equal to { self . threshold } * baseline { self . baseline } . "
2025-12-27 09:16:08 +08:00
def run_vllm_bench_case ( model_name ,
port ,
config ,
2026-01-07 18:42:05 +08:00
baseline ,
threshold = 0.97 ,
2025-12-27 09:16:08 +08:00
model_path = " " ,
host_ip = " localhost " ) :
try :
with VllmbenchRunner ( model_name ,
port ,
config ,
2026-01-07 18:42:05 +08:00
baseline ,
threshold ,
2025-12-27 09:16:08 +08:00
model_path = model_path ,
host_ip = host_ip ) as vllm_bench :
vllm_bench_result = vllm_bench . result
except Exception as e :
print ( e )
error_msg = f " vllm_bench run failed, reason is { e } "
logging . error ( error_msg )
assert False , f " vllm_bench run failed, reason is { e } "
return vllm_bench_result