2025-10-20 09:33:17 +08:00
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
2025-10-28 20:40:03 +08:00
import hashlib
2025-10-20 09:33:17 +08:00
import json
2025-10-28 23:33:15 +08:00
import logging
2025-10-20 09:33:17 +08:00
import os
import re
import subprocess
2025-10-28 20:40:03 +08:00
import tempfile
from pathlib import Path
2025-10-20 09:33:17 +08:00
2025-10-28 20:40:03 +08:00
import filelock
import huggingface_hub
2025-10-20 09:33:17 +08:00
import pandas as pd
from modelscope import snapshot_download # type: ignore
2025-12-27 09:16:08 +08:00
# Root of the AISBench checkout. Taken from the BENCHMARK_HOME environment
# variable when set, otherwise ./benchmark resolved against the current
# working directory at import time.
BENCHMARK_HOME = os.getenv("BENCHMARK_HOME", os.path.abspath("./benchmark"))

# Directory holding the stock dataset config modules; the generated
# "*_custom.py" dataset configs are written next to them.
DATASET_CONF_DIR = os.path.join(BENCHMARK_HOME, "ais_bench", "benchmark",
                                "configs", "datasets")
# Directory holding the stock request (model API) config modules; the
# generated "*_custom.py" request configs are written next to them.
REQUEST_CONF_DIR = os.path.join(BENCHMARK_HOME, "ais_bench", "benchmark",
                                "configs", "models", "vllm_api")
# Directory that accuracy datasets are copied into before a run.
DATASET_DIR = os.path.join(BENCHMARK_HOME, "ais_bench", "datasets")
2025-10-20 09:33:17 +08:00
class AisbenchRunner:
    """Run one AISBench case as a child process and collect its result.

    Constructing an instance generates the custom dataset/request config
    files, launches ``ais_bench``, blocks until the result line appears in
    its output and optionally verifies the result. The instance is also a
    context manager whose exit terminates the child process.
    """

    # Substring of the ais_bench output line that announces where the
    # result was written, keyed by the case's task type.
    RESULT_MSG = {
        "performance": "Performance Result files locate in ",
        "accuracy": "write csv to ",
    }
    # Basename of a dataset directory -> name of the directory it is copied
    # to under DATASET_DIR for accuracy runs.
    DATASET_RENAME = {
        "aime2024": "aime",
        "gsm8k-lite": "gsm8k",
        "textvqa-lite": "textvqa",
    }
def _run_aisbench_task ( self ) :
dataset_conf = self . dataset_conf . split ( ' / ' ) [ - 1 ]
if self . task_type == " accuracy " :
aisbench_cmd = [
' ais_bench ' , ' --models ' , f ' { self . request_conf } _custom ' ,
2025-10-23 17:18:49 +08:00
' --datasets ' , f ' { dataset_conf } '
2025-10-20 09:33:17 +08:00
]
if self . task_type == " performance " :
aisbench_cmd = [
' ais_bench ' , ' --models ' , f ' { self . request_conf } _custom ' ,
2025-10-23 17:18:49 +08:00
' --datasets ' , f ' { dataset_conf } _custom ' , ' --mode ' , ' perf '
2025-10-20 09:33:17 +08:00
]
if self . num_prompts :
aisbench_cmd . extend ( [ ' --num-prompts ' , str ( self . num_prompts ) ] )
print ( f " running aisbench cmd: { ' ' . join ( aisbench_cmd ) } " )
self . proc : subprocess . Popen = subprocess . Popen ( aisbench_cmd ,
stdout = subprocess . PIPE ,
stderr = subprocess . PIPE ,
text = True )
def __init__(self,
             model: str,
             port: int,
             aisbench_config: dict,
             host_ip: str = "localhost",
             verify=True):
    """Prepare, launch and (optionally) verify one AISBench case.

    Args:
        model: Model name written into the request config; also used to
            fetch the model files when the case gives no ``model_path``.
        port: Port of the serving endpoint written into the request config.
        aisbench_config: Case description. Required keys: ``case_type``,
            ``request_conf``, ``max_out_len``, ``batch_size`` and, unless
            ``dataset_path_local`` is given, ``dataset_path``. Optional
            keys: ``dataset_path_local``, ``model_path``, ``dataset_conf``,
            ``num_prompts``, ``request_rate`` (default 0),
            ``trust_remote_code`` (default False), ``temperature``,
            ``top_k``, ``top_p``, ``seed``, ``repetition_penalty``,
            ``baseline`` (default 1) and ``threshold`` (default 1 for
            accuracy, 0.97 for performance).
        host_ip: Host of the serving endpoint written into the request
            config.
        verify: When true, run the accuracy/performance verification that
            matches ``case_type`` after the task has finished.

    Raises:
        AssertionError: if the dataset or model path resolves to ``None``.
        RuntimeError: propagated from waiting on the ``ais_bench`` output.
    """
    self.model = model
    # A local dataset/model path from the case wins; otherwise download.
    self.dataset_path = aisbench_config.get("dataset_path_local")
    if not self.dataset_path:
        self.dataset_path = maybe_download_from_modelscope(
            aisbench_config["dataset_path"], repo_type="dataset")
    self.model_path = aisbench_config.get("model_path")
    if not self.model_path:
        self.model_path = maybe_download_from_modelscope(model)
    assert self.dataset_path is not None and self.model_path is not None, \
        f"Failed to download dataset or model: dataset={self.dataset_path}, model={self.model_path}"

    self.port = port
    self.host_ip = host_ip
    self.task_type = aisbench_config["case_type"]
    self.request_conf = aisbench_config["request_conf"]
    self.dataset_conf = aisbench_config.get("dataset_conf")
    self.num_prompts = aisbench_config.get("num_prompts")
    self.max_out_len = aisbench_config["max_out_len"]
    self.batch_size = aisbench_config["batch_size"]
    self.request_rate = aisbench_config.get("request_rate", 0)
    self.trust_remote_code = aisbench_config.get("trust_remote_code",
                                                 False)
    # Optional sampling overrides; None means "keep the generated default".
    self.temperature = aisbench_config.get("temperature")
    self.top_k = aisbench_config.get("top_k")
    self.top_p = aisbench_config.get("top_p")
    self.seed = aisbench_config.get("seed")
    self.repetition_penalty = aisbench_config.get("repetition_penalty")
    self.exp_folder = None
    self.result_line = None

    self._init_dataset_conf()
    self._init_request_conf()
    self._run_aisbench_task()
    try:
        self._wait_for_task()
        if verify:
            self.baseline = aisbench_config.get("baseline", 1)
            if self.task_type == "accuracy":
                self.threshold = aisbench_config.get("threshold", 1)
                self._accuracy_verify()
            if self.task_type == "performance":
                self.threshold = aisbench_config.get("threshold", 0.97)
                self._performance_verify()
    except BaseException:
        # ``__exit__`` is never invoked when ``__init__`` raises, so the
        # child started above must be stopped here or it would be leaked.
        self.__exit__(None, None, None)
        raise
def _init_dataset_conf(self):
    """Stage the dataset for ``ais_bench`` according to ``self.task_type``.

    * ``"accuracy"``: copy the dataset directory into ``DATASET_DIR``,
      under the name given by ``DATASET_RENAME`` when the directory's
      basename is listed there.
    * ``"performance"``: read ``{dataset_conf}.py`` from
      ``DATASET_CONF_DIR``, point its ``path=`` entry at the dataset and
      write the result as ``{dataset_conf}_custom.py``. For dataset configs
      starting with ``textvqa`` the dataset path is first extended with
      ``textvqa_val.jsonl``.

    Raises:
        ValueError: for a performance case without ``dataset_conf``.
    """
    if self.task_type == "accuracy":
        # normpath drops a trailing slash, which would otherwise make the
        # basename empty and silently skip the rename lookup.
        dataset_name = os.path.basename(os.path.normpath(self.dataset_path))
        dataset_rename = self.DATASET_RENAME.get(dataset_name, "")
        dst_dir = os.path.join(DATASET_DIR, dataset_rename)
        command = ["cp", "-r", self.dataset_path, dst_dir]
        return_code = subprocess.call(command)
        if return_code != 0:
            # Keep the copy best-effort, but do not hide the failure.
            print(f"WARNING: '{' '.join(command)}' exited with code "
                  f"{return_code}")
    if self.task_type == "performance":
        if not self.dataset_conf:
            raise ValueError(
                "performance case is missing 'dataset_conf'")
        conf_path = os.path.join(DATASET_CONF_DIR,
                                 f'{self.dataset_conf}.py')
        if self.dataset_conf.startswith("textvqa"):
            self.dataset_path = os.path.join(self.dataset_path,
                                             "textvqa_val.jsonl")
        with open(conf_path, 'r', encoding='utf-8') as f:
            content = f.read()
        path_entry = f'path="{self.dataset_path}",'
        # A callable replacement is inserted verbatim, so backslashes in
        # the path are not interpreted as regex template escapes.
        content = re.sub(r'path=.*', lambda _match: path_entry, content)
        conf_path_new = os.path.join(DATASET_CONF_DIR,
                                     f'{self.dataset_conf}_custom.py')
        with open(conf_path_new, 'w', encoding='utf-8') as f:
            f.write(content)
def _init_request_conf(self):
    """Generate ``{request_conf}_custom.py`` from the stock request config.

    The stock ``{request_conf}.py`` in ``REQUEST_CONF_DIR`` is read and its
    model, endpoint, output-length, batch-size and ``trust_remote_code``
    entries are rewritten from this runner's attributes. ``top_k``,
    ``seed`` and ``repetition_penalty`` are commented out, task-type
    specific sampling defaults are applied, and finally any sampling
    override set on the case (``temperature``, ``top_p``, ``top_k``,
    ``seed``, ``repetition_penalty``) is written back in. The generated
    text is saved next to the stock file and echoed to stdout.
    """

    def rewrite(pattern, replacement, text):
        # A callable replacement is inserted verbatim, so backslashes in
        # user-supplied values (model name, paths) are not interpreted as
        # regex template escapes.
        return re.sub(pattern, lambda _match: replacement, text)

    conf_path = os.path.join(REQUEST_CONF_DIR, f'{self.request_conf}.py')
    with open(conf_path, 'r', encoding='utf-8') as f:
        content = f.read()

    content = rewrite(r'model=.*', f'model="{self.model}",', content)
    content = rewrite(r'host_port.*', f'host_port = {self.port},', content)
    content = rewrite(r'host_ip.*', f'host_ip = "{self.host_ip}",', content)
    content = rewrite(r'max_out_len.*',
                      f'max_out_len = {self.max_out_len},', content)
    content = rewrite(r'batch_size.*', f'batch_size = {self.batch_size},',
                      content)
    content = rewrite(r'trust_remote_code=.*',
                      f'trust_remote_code={self.trust_remote_code},',
                      content)

    # Comment these entries out; they are re-enabled below only when the
    # case supplies an explicit value.
    content = content.replace("top_k", "#top_k")
    content = content.replace("seed", "#seed")
    content = content.replace("repetition_penalty", "#repetition_penalty")

    if self.task_type == "performance":
        content = rewrite(r'path=.*', f'path="{self.model_path}",', content)
        content = rewrite(r'request_rate.*',
                          f'request_rate = {self.request_rate},', content)
        content = rewrite(r"temperature.*",
                          "temperature = 0,\n ignore_eos = True,", content)
        content = content.replace("top_p", "#top_p")
    if self.task_type == "accuracy":
        content = rewrite(r"temperature.*",
                          "temperature = 0.6,\n ignore_eos = False,",
                          content)

    # Per-case overrides. ``is not None`` (rather than truthiness) keeps an
    # explicit 0, e.g. temperature=0 or seed=0, from being silently dropped.
    if self.temperature is not None:
        content = rewrite(r"temperature.*",
                          f"temperature = {self.temperature},", content)
    if self.top_p is not None:
        # top_p is only commented out for performance runs, hence "#?".
        content = rewrite(r"#?top_p.*", f"top_p = {self.top_p},", content)
    if self.top_k is not None:
        content = rewrite(r"#top_k.*", f"top_k = {self.top_k},", content)
    if self.seed is not None:
        content = rewrite(r"#seed.*", f"seed = {self.seed},", content)
    if self.repetition_penalty is not None:
        content = rewrite(
            r"#repetition_penalty.*",
            f"repetition_penalty = {self.repetition_penalty},", content)

    conf_path_new = os.path.join(REQUEST_CONF_DIR,
                                 f'{self.request_conf}_custom.py')
    with open(conf_path_new, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"The request config is\n {content}")
def __enter__(self):
    """Enter the ``with`` block and hand this runner to its body."""
    return self
def __exit__(self, exc_type, exc_value, traceback):
    """Stop the process held in ``self.proc`` when the ``with`` block ends.

    The process is asked to terminate and given 8 seconds to exit. If it
    is still running after that, it is killed and then waited on so its
    exit status is collected and no zombie process is left behind.

    Args:
        exc_type: Exception type raised in the ``with`` body, if any (unused).
        exc_value: Exception instance raised in the ``with`` body (unused).
        traceback: Traceback of that exception (unused).

    Returns:
        None, so an exception raised in the ``with`` body is not suppressed.
    """
    self.proc.terminate()
    try:
        self.proc.wait(8)
    except subprocess.TimeoutExpired:
        # force kill if needed
        self.proc.kill()
        # Reap the killed child; kill() alone does not collect its status.
        self.proc.wait()
def _wait_for_exp_folder(self):
    """Read process output until the experiment folder is announced.

    Every line read from ``self.proc.stdout`` is stripped and echoed with
    ``print``. When a line contains ``Current exp folder: <path>``, the
    ``<path>`` part is stored in ``self.exp_folder`` and the method returns.

    Raises:
        RuntimeError: If a line contains ``ERROR``, or if the output stream
            ends before the experiment folder has been reported.
    """
    while True:
        raw_line = self.proc.stdout.readline()
        if not raw_line:
            # readline() returns an empty value only at end of stream (a
            # blank line still carries its newline). Without this guard the
            # loop would spin forever once the process has exited.
            raise RuntimeError(
                "Aisbench output ended before the exp folder was reported"
            ) from None
        line = raw_line.strip()
        print(line)
        folder_match = re.search(r'Current exp folder: (.*)', line)
        if folder_match:
            self.exp_folder = folder_match.group(1)
            return
        if "ERROR" in line:
            error_msg = f"Some errors happened to Aisbench runtime, the first error is {line}"
            raise RuntimeError(error_msg) from None
2025-10-20 09:33:17 +08:00
def _wait_for_task(self):
    """Wait until the running task reports where its results were written.

    First waits for the experiment folder via ``_wait_for_exp_folder``.
    Then reads ``self.proc.stdout`` line by line, echoing each stripped
    line with ``print``, until a line contains the marker
    ``self.RESULT_MSG[self.task_type]``. That line is stored in
    ``self.result_line``.

    Raises:
        RuntimeError: If a line contains ``ERROR``, or if the output stream
            ends before the result marker has been seen.
    """
    self._wait_for_exp_folder()
    result_msg = self.RESULT_MSG[self.task_type]
    while True:
        raw_line = self.proc.stdout.readline()
        if not raw_line:
            # An empty read means end of stream; stop instead of looping
            # forever on a process that has already exited.
            raise RuntimeError(
                "Aisbench output ended before the task result was reported"
            ) from None
        line = raw_line.strip()
        print(line)
        if result_msg in line:
            self.result_line = line
            return
        if "ERROR" in line:
            error_msg = f"Some errors happened to Aisbench runtime, the first error is {line}"
            raise RuntimeError(error_msg) from None
2025-10-20 09:33:17 +08:00
def _get_result_performance(self):
    """Load the performance result files named by ``self.result_line``.

    The result directory is taken from the text after
    ``Performance Result files locate in`` with its final character
    dropped. The files ``<prefix>dataset.csv`` and ``<prefix>dataset.json``
    are read from it, where ``<prefix>`` is the first ``/``-separated part
    of ``self.dataset_conf``.

    Sets ``self.result_csv`` (DataFrame indexed by the first CSV column),
    ``self.result_json`` (parsed JSON) and ``self.result`` (a list holding
    both, in that order).
    """
    located = re.search(r'Performance Result files locate in (.*)',
                        self.result_line)
    # The last character of the captured text is not part of the path.
    output_dir = located.group(1)[:-1]
    prefix = self.dataset_conf.split('/')[0]
    csv_path, json_path = (os.path.join(output_dir,
                                        f"{prefix}dataset.{suffix}")
                           for suffix in ("csv", "json"))

    self.result_csv = pd.read_csv(csv_path, index_col=0)
    print("Getting performance results from file: ", json_path)
    with open(json_path, 'r', encoding='utf-8') as json_file:
        self.result_json = json.load(json_file)
    self.result = [self.result_csv, self.result_json]
2025-10-20 09:33:17 +08:00
def _get_result_accuracy(self):
    """Read the accuracy score from the CSV named by ``self.result_line``.

    The CSV path is the text after ``write csv to`` in ``self.result_line``.
    The value in the last column of the row labelled ``0`` is converted to
    ``float`` and stored in ``self.result``.
    """
    acc_file = re.search(r'write csv to (.*)', self.result_line).group(1)
    df = pd.read_csv(acc_file)
    # Pick the last column by position explicitly: plain ``series[-1]`` on a
    # label-indexed row is deprecated positional fallback in pandas 2.x and
    # is treated as a (missing) label in pandas 3.
    self.result = float(df.loc[0].iloc[-1])
2025-10-20 09:33:17 +08:00
def _performance_verify(self):
    """Assert that the measured output throughput reaches the required floor.

    Loads the performance results, reads the ``total`` entry of
    ``Output Token Throughput`` from ``self.result_json``, removes the
    ``token/s`` unit and compares the number with
    ``self.threshold * self.baseline``.

    Raises:
        AssertionError: If the throughput is below that product.
    """
    self._get_result_performance()
    throughput_entry = self.result_json["Output Token Throughput"]
    output_throughput = throughput_entry["total"].replace("token/s", "")
    measured = float(output_throughput)
    required = self.threshold * self.baseline
    assert measured >= required, f"Performance verification failed. The current Output Token Throughput is {output_throughput} token/s, which is not greater than or equal to {self.threshold} * baseline {self.baseline}."
def _accuracy_verify(self):
    """Assert that the measured accuracy lies within the allowed band.

    Loads the accuracy result into ``self.result`` and requires it to be
    between ``self.baseline - self.threshold`` and
    ``self.baseline + self.threshold``, both bounds included.

    Raises:
        AssertionError: If the accuracy falls outside that band.
    """
    self._get_result_accuracy()
    acc_value = self.result
    lower_bound = self.baseline - self.threshold
    upper_bound = self.baseline + self.threshold
    within_band = lower_bound <= acc_value <= upper_bound
    assert within_band, f"Accuracy verification failed. The accuracy of {self.dataset_path} is {acc_value}, which is not within {self.threshold} relative to baseline {self.baseline}."
2025-10-30 23:42:20 +08:00
def run_aisbench_cases(model,
                       port,
                       aisbench_cases,
                       server_args="",
                       host_ip="localhost"):
    """Run each AISBench case in turn and collect the results.

    Falsy entries in ``aisbench_cases`` are skipped. Every other case is run
    through an ``AisbenchRunner`` context; its ``result`` is appended to the
    returned list. A case that raises contributes an empty string instead,
    and the exception is printed and remembered.

    Args:
        model: Model identifier passed to ``AisbenchRunner``.
        port: Port passed to ``AisbenchRunner``.
        aisbench_cases: Iterable of case configurations.
        server_args: Optional text appended to each logged failure message.
        host_ip: Host passed to ``AisbenchRunner``.

    Returns:
        The list of per-case results, in execution order.

    Raises:
        AssertionError: After all cases ran, if at least one of them failed.
    """
    collected = []
    failures = []
    for case in aisbench_cases:
        if case:
            try:
                with AisbenchRunner(model=model,
                                    port=port,
                                    host_ip=host_ip,
                                    aisbench_config=case) as runner:
                    collected.append(runner.result)
            except Exception as exc:
                # Keep going so every case runs; failures are reported
                # together once the loop is done.
                collected.append("")
                failures.append([case, exc])
                print(exc)

    server_note = f"\nserver_args are {server_args}" if server_args else ""
    for failed_case, error_info in failures:
        logging.error(
            f"The following aisbench case failed: {failed_case}, reason is {error_info}"
            + server_note)

    assert not failures, "some aisbench cases failed, info were shown above."
    return collected
def get_TTFT(result):
    """Return the average TTFT of the first result as a float.

    ``result[0][0]`` is looked up at row ``TTFT``, column ``Average``; the
    last three characters of that cell are cut off before the conversion.
    """
    first_table = result[0][0]
    average_cell = first_table.loc["TTFT", "Average"]
    return float(average_cell[:-3])
2025-10-28 20:40:03 +08:00
temp_dir = tempfile . gettempdir ( )
def get_lock ( model_name_or_path : str | Path , cache_dir : str | None = None ) :
lock_dir = cache_dir or temp_dir
model_name_or_path = str ( model_name_or_path )
os . makedirs ( os . path . dirname ( lock_dir ) , exist_ok = True )
model_name = model_name_or_path . replace ( " / " , " - " )
hash_name = hashlib . sha256 ( model_name . encode ( ) ) . hexdigest ( )
# add hash to avoid conflict with old users' lock files
lock_file_name = hash_name + model_name + " .lock "
# mode 0o666 is required for the filelock to be shared across users
lock = filelock . FileLock ( os . path . join ( lock_dir , lock_file_name ) ,
mode = 0o666 )
return lock
def maybe_download_from_modelscope(
    model: str,
    repo_type: str = "model",
    revision: str | None = None,
    download_dir: str | None = None,
    ignore_patterns: str | list[str] | None = None,
    allow_patterns: list[str] | str | None = None,
) -> str:
    """
    Resolve a model / dataset to a local path, fetching it from ModelScope
    when needed.

    If ``model`` already exists on the local filesystem it is returned
    unchanged. Otherwise it is passed to ``snapshot_download`` and the path
    returned by that call is the result.

    Args:
        model: Local path or ModelScope repository id.
        repo_type: Repository type forwarded to ``snapshot_download``.
        revision: Optional revision forwarded to ``snapshot_download``.
        download_dir: Cache directory for the download; also the directory
            handed to ``get_lock``.
        ignore_patterns: Forwarded as ``ignore_file_pattern``.
        allow_patterns: Forwarded as ``allow_patterns``.

    Returns:
        The local path of the model / dataset.
    """
    # Use file lock to prevent multiple processes from
    # downloading the same model weights at the same time.
    with get_lock(model, download_dir):
        if os.path.exists(model):
            return model
        return snapshot_download(
            model_id=model,
            repo_type=repo_type,
            cache_dir=download_dir,
            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
            revision=revision,
            ignore_file_pattern=ignore_patterns,
            allow_patterns=allow_patterns,
        )