[Nightly][Refactor] Migrate nightly single-node model tests from .py to .yaml (#6503)

### What this PR does / why we need it?
This PR refactors the nightly single-node model tests by migrating test
configurations from Python scripts to a more maintainable YAML-based
format.

| Original PR | Python (`.py`) | YAML (`.yaml`) |
| :--- | :--- | :--- |
| [#3568](https://github.com/vllm-project/vllm-ascend/pull/3568) | `test_deepseek_r1_0528_w8a8_eplb.py` | `DeepSeek-R1-0528-W8A8.yaml` |
| [#3631](https://github.com/vllm-project/vllm-ascend/pull/3631) | `test_deepseek_r1_0528_w8a8.py` | `DeepSeek-R1-0528-W8A8.yaml` |
| [#5874](https://github.com/vllm-project/vllm-ascend/pull/5874) | `test_deepseek_r1_w8a8_hbm.py` | `DeepSeek-R1-W8A8-HBM.yaml` |
| [#3908](https://github.com/vllm-project/vllm-ascend/pull/3908) | `test_deepseek_v3_2_w8a8.py` | `DeepSeek-V3.2-W8A8.yaml` |
| [#5682](https://github.com/vllm-project/vllm-ascend/pull/5682) | `test_kimi_k2_thinking.py` | `Kimi-K2-Thinking.yaml` |
| [#4111](https://github.com/vllm-project/vllm-ascend/pull/4111) | `test_mtpx_deepseek_r1_0528_w8a8.py` | `MTPX-DeepSeek-R1-0528-W8A8.yaml` |
| [#3733](https://github.com/vllm-project/vllm-ascend/pull/3733) | `test_prefix_cache_deepseek_r1_0528_w8a8.py` | `Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml` |
| [#6543](https://github.com/vllm-project/vllm-ascend/pull/6543) | `test_qwen3_235b_w8a8.py` | `Qwen3-235B-A22B-W8A8.yaml` |
| [#6543](https://github.com/vllm-project/vllm-ascend/pull/6543) | `test_qwen3_235b_a22b_w8a8_eplb.py` | `Qwen3-235B-A22B-W8A8.yaml` |
| [#3973](https://github.com/vllm-project/vllm-ascend/pull/3973) | `test_qwen3_30b_w8a8.py` | `Qwen3-30B-A3B-W8A8.yaml` |
| [#3541](https://github.com/vllm-project/vllm-ascend/pull/3541) | `test_qwen3_32b_int8.py` | `Qwen3-32B-Int8.yaml` |
| [#3757](https://github.com/vllm-project/vllm-ascend/pull/3757) | `test_qwq_32b.py` | `QwQ-32B.yaml` |
| [#5616](https://github.com/vllm-project/vllm-ascend/pull/5616) | `test_qwen3_next_w8a8.py` | `Qwen3-Next-80B-A3B-Instruct-W8A8.yaml` |
| [#3541](https://github.com/vllm-project/vllm-ascend/pull/3541) | `test_qwen2_5_vl_7b.py` | `Qwen2.5-VL-7B-Instruct.yaml` |
| [#5301](https://github.com/vllm-project/vllm-ascend/pull/5301) | `test_qwen2_5_vl_7b_epd.py` | `Qwen2.5-VL-7B-Instruct-EPD.yaml` |
| [#3707](https://github.com/vllm-project/vllm-ascend/pull/3707) | `test_qwen2_5_vl_32b.py` | `Qwen2.5-VL-32B-Instruct.yaml` |
| [#3676](https://github.com/vllm-project/vllm-ascend/pull/3676) | `test_qwen3_32b_int8_a3_feature_stack3.py` | `Qwen3-32B-Int8-A3-Feature-Stack3.yaml` |
| [#3709](https://github.com/vllm-project/vllm-ascend/pull/3709) | `test_prefix_cache_qwen3_32b_int8.py` | `Prefix-Cache-Qwen3-32B-Int8.yaml` |
| [#5395](https://github.com/vllm-project/vllm-ascend/pull/5395) | `test_qwen3_next.py` | `Qwen3-Next-80B-A3B-Instruct-A2.yaml` |
| [#3474](https://github.com/vllm-project/vllm-ascend/pull/3474) | `test_qwen3_32b.py` | `Qwen3-32B.yaml` |
| [#3541](https://github.com/vllm-project/vllm-ascend/pull/3541) | `test_qwen3_32b_int8.py` | `Qwen3-32B-Int8-A2.yaml` |

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
Author: SILONG ZENG
Committed: 2026-03-03 20:13:43 +08:00 (committed by GitHub)
Commit: 859f2c25b9 (parent: a0a904a3d4)
51 changed files with 2265 additions and 2336 deletions


@@ -28,7 +28,10 @@ on:
         type: string
         default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11"
       tests:
-        required: true
+        required: false
         type: string
+      config_file_path:
+        required: false
+        type: string
       name:
         required: false
@@ -44,12 +47,12 @@ defaults:
 # only cancel in-progress runs of the same workflow
 # and ignore the lint / 1 card / 4 cards test type
 concurrency:
-  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.tests }}
+  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.config_file_path || inputs.tests }}
   cancel-in-progress: true
 jobs:
   e2e-nightly:
-    name: ${{ inputs.tests }}
+    name: ${{ inputs.name || inputs.config_file_path || inputs.tests }}
     runs-on: ${{ inputs.runner }}
     timeout-minutes: 600
     container:
@@ -114,14 +117,33 @@ jobs:
           update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20
           update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20
-      - name: Run vllm-project/vllm-ascend test
+      - name: Validate Inputs
+        run: |
+          if [[ -z "${{ inputs.tests }}" && -z "${{ inputs.config_file_path }}" ]]; then
+            echo "Error: Either 'tests' or 'config_file_path' must be provided."
+            exit 1
+          fi
+      - name: Run Pytest (py-driven)
+        if: ${{ inputs.tests != '' }}
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
           VLLM_USE_MODELSCOPE: True
           VLLM_CI_RUNNER: ${{ inputs.runner }}
           BENCHMARK_HOME: /vllm-workspace/vllm-ascend/benchmark
         working-directory: /vllm-workspace/vllm-ascend
         run: |
           # ignore test_dispatch_ffn_combine until the test is fixed
-          pytest -sv ${{ inputs.tests }} \
+          echo "Running pytest with tests path: ${{ inputs.tests }}"
+          pytest -sv "${{ inputs.tests }}" \
             --ignore=tests/e2e/nightly/single_node/ops/singlecard_ops/test_fused_moe.py
+      - name: Run Pytest (YAML-driven)
+        if: ${{ always() && inputs.config_file_path != '' }}
+        env:
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+          VLLM_USE_MODELSCOPE: True
+          VLLM_CI_RUNNER: ${{ inputs.runner }}
+          CONFIG_YAML_PATH: ${{ inputs.config_file_path }}
+        working-directory: /vllm-workspace/vllm-ascend
+        run: |
+          echo "Running YAML-driven test with config: ${{ inputs.config_file_path }}"
+          pytest -sv tests/e2e/nightly/single_node/models/scripts/test_single_node.py


@@ -49,15 +49,6 @@ jobs:
       fail-fast: false
       matrix:
         test_config:
-          - name: qwen3-next
-            os: linux-aarch64-a2b3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_next.py
-          - name: qwen3-32b
-            os: linux-aarch64-a2b3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_32b.py
-          - name: qwen3-32b-in8-a2
-            os: linux-aarch64-a2b3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_32b_int8.py
           - name: test_custom_op
             os: linux-aarch64-a2b3-1
             tests: tests/e2e/nightly/single_node/ops/singlecard_ops
@@ -71,10 +62,33 @@ jobs:
       name: ${{ matrix.test_config.name }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
+  single-node-yaml-tests:
+    name: single-node
+    if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
+    strategy:
+      fail-fast: false
+      matrix:
+        test_config:
+          - name: qwen3-32b
+            os: linux-aarch64-a2b3-4
+            config_file_path: Qwen3-32B.yaml
+          - name: qwen3-next-80b-a3b-instruct
+            os: linux-aarch64-a2b3-4
+            config_file_path: Qwen3-Next-80B-A3B-Instruct-A2.yaml
+          - name: qwen3-32b-int8
+            os: linux-aarch64-a2b3-4
+            config_file_path: Qwen3-32B-Int8-A2.yaml
+    uses: ./.github/workflows/_e2e_nightly_single_node.yaml
+    with:
+      runner: ${{ matrix.test_config.os }}
+      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
+      config_file_path: ${{ matrix.test_config.config_file_path }}
+      name: ${{ matrix.test_config.name }}
   multi-node-tests:
     name: multi-node
     if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
-    needs: single-node-tests
+    needs: [single-node-tests, single-node-yaml-tests]
     strategy:
       fail-fast: false
       max-parallel: 1


@@ -109,65 +109,11 @@ jobs:
   single-node-tests:
     name: single-node
     if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
-    needs: multi-node-tests
+    needs: [multi-node-tests]
     strategy:
       fail-fast: false
       matrix:
         test_config:
-          - name: qwen3-32b-in8-a3
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_32b_int8.py
-          - name: qwen3-32b-int8-a3-feature-stack3
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_32b_int8_a3_feature_stack3.py
-          - name: qwen3-235b-a22b-w8a8-eplb
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py
-          - name: deepseek-r1-w8a8-eplb
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
-          - name: deepseek-r1-w8a8-mtpx
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_mtpx_deepseek_r1_0528_w8a8.py
-          - name: qwen2-5-vl-7b
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b.py
-          - name: qwen2-5-vl-7b-epd
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b_epd.py
-          - name: qwen2-5-vl-32b
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen2_5_vl_32b.py
-          - name: qwen3-32b-int8-prefix-cache
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_prefix_cache_qwen3_32b_int8.py
-          - name: deepseek-r1-0528-w8a8
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
-          - name: deepseek-r1-0528-w8a8-prefix-cache
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_prefix_cache_deepseek_r1_0528_w8a8.py
-          - name: qwq-32b-a3
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwq_32b.py
-          - name: qwen3-30b-w8a8
-            os: linux-aarch64-a3-2
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_30b_w8a8.py
-          - name: qwen3-235b-w8a8
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py
-          - name: qwen3-next-w8a8
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_next_w8a8.py
-          - name: kimi-k2-thinking
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_kimi_k2_thinking.py
-          - name: deepseek-r1-w8a8-hbm
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hbm.py
-          - name: deepseek3_2-w8a8
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py
           - name: qwen3-30b-acc
             os: linux-aarch64-a3-4
             tests: tests/e2e/weekly/single_node/models/test_qwen3_30b_acc.py
@@ -178,6 +124,70 @@
       tests: ${{ matrix.test_config.tests }}
       name: ${{ matrix.test_config.name }}
+  single-node-yaml-tests:
+    name: single-node
+    if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
+    needs: [multi-node-tests]
+    strategy:
+      fail-fast: false
+      matrix:
+        test_config:
+          # YAML-driven tests
+          - name: deepseek-r1-0528-w8a8
+            os: linux-aarch64-a3-16
+            config_file_path: DeepSeek-R1-0528-W8A8.yaml
+          - name: deepseek-r1-w8a8-hbm
+            os: linux-aarch64-a3-16
+            config_file_path: DeepSeek-R1-W8A8-HBM.yaml
+          - name: deepseek-v3-2-w8a8
+            os: linux-aarch64-a3-16
+            config_file_path: DeepSeek-V3.2-W8A8.yaml
+          - name: kimi-k2-thinking
+            os: linux-aarch64-a3-16
+            config_file_path: Kimi-K2-Thinking.yaml
+          - name: mtpx-deepseek-r1-0528-w8a8
+            os: linux-aarch64-a3-16
+            config_file_path: MTPX-DeepSeek-R1-0528-W8A8.yaml
+          - name: qwen3-235b-a22b-w8a8
+            os: linux-aarch64-a3-16
+            config_file_path: Qwen3-235B-A22B-W8A8.yaml
+          - name: qwen3-30b-a3b-w8a8
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen3-30B-A3B-W8A8.yaml
+          - name: qwen3-next-80b-a3b-instruct-w8a8
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen3-Next-80B-A3B-Instruct-W8A8.yaml
+          - name: qwq-32b
+            os: linux-aarch64-a3-4
+            config_file_path: QwQ-32B.yaml
+          - name: qwen3-32b-int8
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen3-32B-Int8.yaml
+          - name: qwen2-5-vl-7b
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen2.5-VL-7B-Instruct.yaml
+          - name: qwen2-5-vl-7b-epd
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen2.5-VL-7B-Instruct-EPD.yaml
+          - name: qwen2-5-vl-32b
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen2.5-VL-32B-Instruct.yaml
+          - name: qwen3-32b-int8-a3-feature-stack3
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen3-32B-Int8-A3-Feature-Stack3.yaml
+          - name: qwen3-32b-int8-prefix-cache
+            os: linux-aarch64-a3-4
+            config_file_path: Prefix-Cache-Qwen3-32B-Int8.yaml
+          - name: deepseek-r1-0528-w8a8-prefix-cache
+            os: linux-aarch64-a3-16
+            config_file_path: Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml
+    uses: ./.github/workflows/_e2e_nightly_single_node.yaml
+    with:
+      runner: ${{ matrix.test_config.os }}
+      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
+      config_file_path: ${{ matrix.test_config.config_file_path }}
+      name: ${{ matrix.test_config.name }}
   custom-ops-tests:
     name: test ops
     if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')


@@ -0,0 +1,94 @@
# ==========================================
# Shared Configurations
# ==========================================
_envs: &envs
OMP_NUM_THREADS: "10"
OMP_PROC_BIND: "false"
HCCL_BUFFSIZE: "1024"
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
SERVER_PORT: "DEFAULT_PORT"
_server_cmd: &server_cmd
- "--quantization"
- "ascend"
- "--data-parallel-size"
- "2"
- "--tensor-parallel-size"
- "8"
- "--enable-expert-parallel"
- "--port"
- "$SERVER_PORT"
- "--seed"
- "1024"
- "--max-model-len"
- "36864"
- "--max-num-batched-tokens"
- "4096"
- "--max-num-seqs"
- "16"
- "--trust-remote-code"
- "--gpu-memory-utilization"
- "0.9"
- "--speculative-config"
- '{"num_speculative_tokens": 1, "method": "mtp"}'
- "--additional-config"
- '{"enable_weight_nz_layout": true}'
_benchmarks_acc: &benchmarks_acc
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 32768
batch_size: 32
baseline: 95
threshold: 5
_benchmarks_perf: &benchmarks_perf
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 400
max_out_len: 1500
batch_size: 1000
baseline: 1
threshold: 0.97
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "DeepSeek-R1-0528-W8A8-single"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--enforce-eager"
benchmarks:
- name: "DeepSeek-R1-0528-W8A8-aclgraph"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
envs:
<<: *envs
server_cmd: *server_cmd
benchmarks:
      <<: [*benchmarks_acc, *benchmarks_perf]
- name: "DeepSeek-R1-0528-W8A8-EPLB"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
envs:
<<: *envs
DYNAMIC_EPLB: "true"
server_cmd: *server_cmd
server_cmd_extra:
- "--additional-config"
- '{"enable_weight_nz_layout": true, "eplb_config": {"dynamic_eplb": "true", "expert_heat_collection_interval": 1000, "algorithm_execution_interval": 50, "eplb_policy_type": 3}}'
benchmarks:
<<: *benchmarks_acc
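For orientation, this is roughly what the loader (`single_node_config.py`, included later in this commit) does with each case above: fill a `DEFAULT_PORT` env with a free port, append `server_cmd_extra` to `server_cmd`, and expand `$VAR` placeholders. A simplified standalone sketch (the real code uses `vllm.utils.network_utils.get_open_port` and handles both `$VAR` and `${VAR}` forms):

```python
import re
import socket

def get_open_port() -> int:
    # Stand-in for vllm.utils.network_utils.get_open_port.
    with socket.socket() as s:
        s.bind(("", 0))
        return s.getsockname()[1]

case = {
    "envs": {"SERVER_PORT": "DEFAULT_PORT"},
    "server_cmd": ["--port", "$SERVER_PORT"],
    "server_cmd_extra": ["--enforce-eager"],
}

# DEFAULT_PORT (or a missing port key) is replaced by an open port.
if case["envs"].get("SERVER_PORT") in ("DEFAULT_PORT", None):
    case["envs"]["SERVER_PORT"] = str(get_open_port())

# server_cmd_extra is appended, then $VAR placeholders are expanded.
cmd = case["server_cmd"] + case.pop("server_cmd_extra")
cmd = [re.sub(r"\$(\w+)",
              lambda m: str(case["envs"].get(m.group(1), m.group(0))), c)
       for c in cmd]
print(cmd)  # e.g. ['--port', '54321', '--enforce-eager']
```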


@@ -0,0 +1,42 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "DeepSeek-R1-W8A8-HBM-single"
model: "vllm-ascend/DeepSeek-R1-W8A8"
envs:
HCCL_BUFFSIZE: "1024"
SERVER_PORT: "DEFAULT_PORT"
server_cmd:
- "--quantization"
- "ascend"
- "--port"
- "$SERVER_PORT"
- "--data-parallel-size"
- "8"
- "--data-parallel-size-local"
- "8"
- "--data-parallel-rpc-port"
- "13389"
- "--tensor-parallel-size"
- "2"
- "--enable-expert-parallel"
- "--seed"
- "1024"
- "--max-num-seqs"
- "32"
- "--max-model-len"
- "6000"
- "--max-num-batched-tokens"
- "6000"
- "--trust-remote-code"
- "--gpu-memory-utilization"
- "0.92"
- "--no-enable-prefix-caching"
- "--reasoning-parser"
- "deepseek_r1"
- "--enforce-eager"
- "--additional-config"
- '{"ascend_scheduler_config": {"enabled": false}, "torchair_graph_config": {"enabled": false, "enable_multistream_shared_expert": false}}'
benchmarks:


@@ -0,0 +1,78 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "DeepSeek-V3.2-W8A8-TP8-DP2"
model: "vllm-ascend/DeepSeek-V3.2-W8A8"
envs:
HCCL_OP_EXPANSION_MODE: "AIV"
OMP_PROC_BIND: "false"
OMP_NUM_THREADS: "1"
HCCL_BUFFSIZE: "1024"
VLLM_ASCEND_ENABLE_MLAPO: "1"
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
VLLM_ASCEND_ENABLE_FLASHCOMM1: "1"
VLLM_ENGINE_READY_TIMEOUT_S: "1800"
SERVER_PORT: "DEFAULT_PORT"
server_cmd:
- "--enable-expert-parallel"
- "--tensor-parallel-size"
- "8"
- "--data-parallel-size"
- "2"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "8192"
- "--max-num-batched-tokens"
- "8192"
- "--max-num-seqs"
- "4"
- "--trust-remote-code"
- "--quantization"
- "ascend"
- "--gpu-memory-utilization"
- "0.98"
- "--compilation-config"
- '{"cudagraph_capture_sizes":[8, 16, 24, 32, 40, 48], "cudagraph_mode":"FULL_DECODE_ONLY"}'
- "--speculative-config"
- '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
- "--additional-config"
- '{"layer_sharding": ["q_b_proj", "o_proj"]}'
- "--reasoning-parser"
- "deepseek_v3"
- "--tokenizer_mode"
- "deepseek_v32"
benchmarks:
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 4096
batch_size: 8
baseline: 95
threshold: 5
perf_1:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 1
max_out_len: 1500
batch_size: 1
request_rate: 11.2
baseline: 134
threshold: 0.97
perf_2:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 100
max_out_len: 1500
batch_size: 4
request_rate: 11.2
baseline: 134
threshold: 0.97


@@ -0,0 +1,72 @@
# ==========================================
# Shared Configurations
# ==========================================
_envs: &envs
HCCL_BUFFSIZE: "1024"
SERVER_PORT: "DEFAULT_PORT"
_server_cmd: &server_cmd
- "--no-enable-prefix-caching"
- "--enable-expert-parallel"
- "--tensor-parallel-size"
- "8"
- "--data-parallel-size"
- "2"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "8192"
- "--max-num-batched-tokens"
- "8192"
- "--block-size"
- "16"
- "--trust-remote-code"
- "--gpu-memory-utilization"
- "0.9"
_benchmarks: &benchmarks
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 4096
batch_size: 8
baseline: 95
threshold: 5
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 16
max_out_len: 1500
batch_size: 8
request_rate: 0
baseline: 1
threshold: 0.97
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "GLM-4.5-TP8-DP2-fullgraph"
model: "ZhipuAI/GLM-4.5"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--compilation-config"
- '{"cudagraph_capture": [1,2,4,8,16], "cudagraph_model":"FULL_DECODE_ONLY"}'
benchmarks:
<<: *benchmarks
- name: "GLM-4.5-TP8-DP2-eager"
model: "ZhipuAI/GLM-4.5"
envs:
<<: *envs
server_cmd: *server_cmd
benchmarks:
<<: *benchmarks


@@ -0,0 +1,52 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Kimi-K2-Thinking-TP16-Case"
model: "moonshotai/Kimi-K2-Thinking"
envs:
HCCL_BUFFSIZE: "1024"
TASK_QUEUE_ENABLE: "1"
OMP_PROC_BIND: "false"
HCCL_OP_EXPANSION_MODE: "AIV"
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
SERVER_PORT: "DEFAULT_PORT"
server_cmd:
- "--tensor-parallel-size"
- "16"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "8192"
- "--max-num-batched-tokens"
- "8192"
- "--max-num-seqs"
- "12"
- "--gpu-memory-utilization"
- "0.9"
- "--trust-remote-code"
- "--enable-expert-parallel"
- "--no-enable-prefix-caching"
benchmarks:
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 4096
batch_size: 32
baseline: 95
threshold: 5
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 512
max_out_len: 256
batch_size: 64
trust_remote_code: true
request_rate: 11.2
baseline: 1
threshold: 0.97


@@ -0,0 +1,90 @@
# ==========================================
# Shared Configurations
# ==========================================
_envs: &envs
OMP_NUM_THREADS: "100"
OMP_PROC_BIND: "false"
HCCL_BUFFSIZE: "1024"
VLLM_RPC_TIMEOUT: "3600000"
VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "3600000"
SERVER_PORT: "DEFAULT_PORT"
_server_cmd: &server_cmd
- "--quantization"
- "ascend"
- "--seed"
- "1024"
- "--no-enable-prefix-caching"
- "--data-parallel-size"
- "2"
- "--tensor-parallel-size"
- "8"
- "--enable-expert-parallel"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "40960"
- "--max-num-seqs"
- "14"
- "--trust-remote-code"
_benchmarks_gsm8k: &benchmarks_gsm8k
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 32768
batch_size: 32
baseline: 95
threshold: 5
_benchmarks_aime: &benchmarks_aime
acc:
case_type: accuracy
dataset_path: vllm-ascend/aime2024
request_conf: vllm_api_general_chat
dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
max_out_len: 32768
batch_size: 32
baseline: 86.67
threshold: 7
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "MTPX-DeepSeek-R1-0528-W8A8-mtp2"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--max-num-batched-tokens"
- "4096"
- "--speculative-config"
- '{"num_speculative_tokens": 2, "method": "mtp"}'
- "--gpu-memory-utilization"
- "0.92"
benchmarks:
<<: *benchmarks_gsm8k
- name: "MTPX-DeepSeek-R1-0528-W8A8-mtp3"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
envs:
<<: *envs
HCCL_OP_EXPANSION_MODE: "AIV"
server_cmd: *server_cmd
server_cmd_extra:
- "--max-num-batched-tokens"
- "2048"
- "--speculative-config"
- '{"num_speculative_tokens": 3, "method": "mtp"}'
- "--gpu-memory-utilization"
- "0.9"
- "--compilation-config"
- '{"cudagraph_capture_sizes": [56], "cudagraph_mode": "FULL_DECODE_ONLY"}'
benchmarks:
<<: *benchmarks_aime


@@ -0,0 +1,77 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "prefix-cache-deepseek-r1-0528-w8a8"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
envs:
OMP_NUM_THREADS: "10"
OMP_PROC_BIND: "false"
HCCL_BUFFSIZE: "1024"
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
SERVER_PORT: "DEFAULT_PORT"
server_cmd:
- "--quantization"
- "ascend"
- "--data-parallel-size"
- "2"
- "--tensor-parallel-size"
- "8"
- "--enable-expert-parallel"
- "--port"
- "$SERVER_PORT"
- "--seed"
- "1024"
- "--max-model-len"
- "5200"
- "--max-num-batched-tokens"
- "4096"
- "--max-num-seqs"
- "16"
- "--trust-remote-code"
- "--gpu-memory-utilization"
- "0.9"
- "--additional-config"
- '{"enable_weight_nz_layout": true}'
- "--speculative-config"
- '{"num_speculative_tokens": 1, "method": "mtp"}'
test_content:
- "benchmark_comparisons"
benchmark_comparisons_args:
- metric: "TTFT"
baseline: "prefix0"
target: "prefix75"
ratio: 0.8
operator: "<"
benchmarks:
warm_up:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in1024-bs210
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 210
max_out_len: 2
batch_size: 1000
baseline: 0
threshold: 0.97
prefix0:
case_type: performance
dataset_path: vllm-ascend/prefix0-in3500-bs210
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 210
max_out_len: 1500
batch_size: 18
baseline: 1
threshold: 0.97
prefix75:
case_type: performance
dataset_path: vllm-ascend/prefix75-in3500-bs210
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 210
max_out_len: 1500
batch_size: 18
baseline: 1
threshold: 0.97
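The `benchmark_comparisons` block above is what actually asserts the prefix-cache win: the TTFT measured on the `prefix75` run (75% shared prefix) must stay below 0.8x the TTFT of the `prefix0` run. In sketch form (the metric values here are hypothetical; the real check lives in the test driver):

```python
import operator

# Hypothetical TTFT results (ms), keyed by the benchmark names above.
ttft = {"prefix0": 420.0, "prefix75": 210.0}

cmp_arg = {"metric": "TTFT", "baseline": "prefix0", "target": "prefix75",
           "ratio": 0.8, "operator": "<"}

ops = {"<": operator.lt, ">": operator.gt}
bound = ttft[cmp_arg["baseline"]] * cmp_arg["ratio"]
assert ops[cmp_arg["operator"]](ttft[cmp_arg["target"]], bound), (
    f"{cmp_arg['metric']} of {cmp_arg['target']} is not "
    f"{cmp_arg['operator']} {bound}")
```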


@@ -0,0 +1,70 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "prefix-cache-qwen3-32b-w8a8"
model: "vllm-ascend/Qwen3-32B-W8A8"
envs:
TASK_QUEUE_ENABLE: "1"
HCCL_OP_EXPANSION_MODE: "AIV"
SERVER_PORT: "DEFAULT_PORT"
server_cmd:
- "--quantization"
- "ascend"
- "--reasoning-parser"
- "qwen3"
- "--tensor-parallel-size"
- "4"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "8192"
- "--max-num-batched-tokens"
- "8192"
- "--max-num-seqs"
- "256"
- "--trust-remote-code"
- "--gpu-memory-utilization"
- "0.9"
- "--additional-config"
- '{"enable_weight_nz_layout": true}'
test_content:
- "benchmark_comparisons"
benchmark_comparisons_args:
- metric: "TTFT"
baseline: "prefix0"
target: "prefix75"
ratio: 0.8
operator: "<"
benchmarks:
warm_up:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in1024-bs210
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 210
max_out_len: 2
batch_size: 1000
baseline: 0
threshold: 0.97
prefix0:
case_type: performance
dataset_path: vllm-ascend/prefix0-in3500-bs210
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 210
max_out_len: 1500
batch_size: 48
baseline: 1
threshold: 0.97
prefix75:
case_type: performance
dataset_path: vllm-ascend/prefix75-in3500-bs210
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 210
max_out_len: 1500
batch_size: 48
baseline: 1
threshold: 0.97


@@ -0,0 +1,78 @@
# ==========================================
# Shared Configurations
# ==========================================
_envs: &envs
TASK_QUEUE_ENABLE: "1"
OMP_PROC_BIND: "false"
HCCL_OP_EXPANSION_MODE: "AIV"
VLLM_ASCEND_ENABLE_FLASHCOMM: "1"
VLLM_ASCEND_ENABLE_DEBSE_OPTIMIZE: "1"
SERVER_PORT: "DEFAULT_PORT"
_server_cmd: &server_cmd
- "--tensor-parallel-size"
- "4"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "36864"
- "--max-num-batched-tokens"
- "36864"
- "--block-size"
- "128"
- "--trust-remote-code"
- "--gpu-memory-utilization"
- "0.9"
- "--reasoning-parser"
- "deepseek_r1"
- "--distributed_executor_backend"
- "mp"
- "--additional-config"
- '{"weight_prefetch_config":{"enabled":true}}'
_benchmarks: &benchmarks
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 32768
batch_size: 32
baseline: 95
threshold: 5
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 240
max_out_len: 1500
batch_size: 60
baseline: 1
threshold: 0.97
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "QwQ-32B-aclgraph"
model: "Qwen/QwQ-32B"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--compilation_config"
- '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
benchmarks:
<<: *benchmarks
- name: "QwQ-32B-single"
model: "Qwen/QwQ-32B"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--enforce-eager"
benchmarks:


@@ -0,0 +1,63 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Qwen2.5-VL-32B-Instruct-a3"
model: "Qwen/Qwen2.5-VL-32B-Instruct"
envs:
TASK_QUEUE_ENABLE: "1"
VLLM_ASCEND_ENABLE_NZ: "0"
HCCL_OP_EXPANSION_MODE: "AIV"
SERVER_PORT: "DEFAULT_PORT"
server_cmd:
- "--no-enable-prefix-caching"
- "--mm-processor-cache-gb"
- "0"
- "--tensor-parallel-size"
- "4"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "30000"
- "--max-num-batched-tokens"
- "40000"
- "--max-num-seqs"
- "400"
- "--trust-remote-code"
- "--gpu-memory-utilization"
- "0.8"
- "--compilation_config"
- '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
test_content:
- "completion"
- "image"
benchmarks:
acc:
case_type: accuracy
dataset_path: vllm-ascend/textvqa-lite
request_conf: vllm_api_stream_chat
dataset_conf: textvqa/textvqa_gen_base64
max_out_len: 2048
batch_size: 128
baseline: 76.22
temperature: 0
top_k: -1
top_p: 1
repetition_penalty: 1
threshold: 5
perf:
case_type: performance
dataset_path: vllm-ascend/textvqa-perf-1080p
request_conf: vllm_api_stream_chat
dataset_conf: textvqa/textvqa_gen_base64
num_prompts: 512
max_out_len: 256
batch_size: 128
temperature: 0
top_k: -1
top_p: 1
repetition_penalty: 1
request_rate: 0
baseline: 1
threshold: 0.97


@@ -0,0 +1,92 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Qwen2.5-VL-7B-Instruct-epd"
model: "Qwen/Qwen2.5-VL-7B-Instruct"
service_mode: "epd"
envs:
ENCODE_PORT: "DEFAULT_PORT"
PD_PORT: "DEFAULT_PORT"
PROXY_PORT: "DEFAULT_PORT"
epd_server_cmds:
- - "--port"
- "$ENCODE_PORT"
- "--model"
- "Qwen/Qwen2.5-VL-7B-Instruct"
- "--gpu-memory-utilization"
- "0.01"
- "--tensor-parallel-size"
- "1"
- "--enforce-eager"
- "--no-enable-prefix-caching"
- "--max-model-len"
- "10000"
- "--max-num-batched-tokens"
- "10000"
- "--max-num-seqs"
- "1"
- "--ec-transfer-config"
- '{"ec_connector_extra_config":{"shared_storage_path":"/dev/shm/epd/storage"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
- - "--port"
- "$PD_PORT"
- "--model"
- "Qwen/Qwen2.5-VL-7B-Instruct"
- "--gpu-memory-utilization"
- "0.95"
- "--tensor-parallel-size"
- "1"
- "--enforce-eager"
- "--max-model-len"
- "10000"
- "--max-num-batched-tokens"
- "10000"
- "--max-num-seqs"
- "128"
- "--ec-transfer-config"
- '{"ec_connector_extra_config":{"shared_storage_path":"/dev/shm/epd/storage"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
epd_proxy_args:
- "--host"
- "127.0.0.1"
- "--port"
- "$PROXY_PORT"
- "--encode-servers-urls"
- "http://localhost:$ENCODE_PORT"
- "--decode-servers-urls"
- "http://localhost:$PD_PORT"
- "--prefill-servers-urls"
- "disable"
test_content:
benchmarks:
warm_up:
case_type: performance
dataset_path: vllm-ascend/textvqa-perf-1080p
request_conf: vllm_api_stream_chat
dataset_conf: textvqa/textvqa_gen_base64
num_prompts: 50
max_out_len: 20
batch_size: 32
request_rate: 0
baseline: 1
threshold: 0.97
acc:
case_type: accuracy
dataset_path: vllm-ascend/textvqa-lite
request_conf: vllm_api_stream_chat
dataset_conf: textvqa/textvqa_gen_base64
max_out_len: 2048
batch_size: 128
baseline: 82.05
threshold: 5
perf:
case_type: performance
dataset_path: vllm-ascend/textvqa-perf-1080p
request_conf: vllm_api_stream_chat
dataset_conf: textvqa/textvqa_gen_base64
num_prompts: 512
max_out_len: 256
batch_size: 128
request_rate: 0
baseline: 1
threshold: 0.97


@@ -0,0 +1,55 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Qwen2.5-VL-7B-Instruct"
model: "Qwen/Qwen2.5-VL-7B-Instruct"
envs:
TASK_QUEUE_ENABLE: "1"
VLLM_ASCEND_ENABLE_NZ: "0"
HCCL_OP_EXPANSION_MODE: "AIV"
SERVER_PORT: "DEFAULT_PORT"
server_cmd:
- "--no-enable-prefix-caching"
- "--mm-processor-cache-gb"
- "0"
- "--tensor-parallel-size"
- "4"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "30000"
- "--max-num-batched-tokens"
- "40000"
- "--max-num-seqs"
- "400"
- "--trust-remote-code"
- "--gpu-memory-utilization"
- "0.8"
- "--compilation_config"
- '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
test_content:
- "completion"
- "image"
benchmarks:
acc:
case_type: accuracy
dataset_path: vllm-ascend/textvqa-lite
request_conf: vllm_api_stream_chat
dataset_conf: textvqa/textvqa_gen_base64
max_out_len: 2048
batch_size: 128
baseline: 82.05
threshold: 5
perf:
case_type: performance
dataset_path: vllm-ascend/textvqa-perf-1080p
request_conf: vllm_api_stream_chat
dataset_conf: textvqa/textvqa_gen_base64
num_prompts: 512
max_out_len: 256
batch_size: 128
request_rate: 0
baseline: 1
threshold: 0.97


@@ -0,0 +1,85 @@
# ==========================================
# Shared Configurations
# ==========================================
_envs: &envs
OMP_NUM_THREADS: "10"
OMP_PROC_BIND: "false"
HCCL_BUFFSIZE: "1024"
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
VLLM_ASCEND_ENABLE_FLASHCOMM1: "1"
SERVER_PORT: "DEFAULT_PORT"
_server_cmd: &server_cmd
- "--quantization"
- "ascend"
- "--async-scheduling"
- "--data-parallel-size"
- "4"
- "--tensor-parallel-size"
- "4"
- "--enable-expert-parallel"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "40960"
- "--max-num-batched-tokens"
- "8192"
- "--max-num-seqs"
- "12"
- "--trust-remote-code"
- "--gpu-memory-utilization"
- "0.9"
_benchmarks: &benchmarks
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 32768
batch_size: 32
top_k: 20
baseline: 95
threshold: 5
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Qwen3-235B-A22B-W8A8-full_graph"
model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--compilation-config"
- '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
benchmarks:
<<: *benchmarks
- name: "Qwen3-235B-A22B-W8A8-piecewise"
model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--compilation-config"
- '{"cudagraph_mode": "PIECEWISE"}'
benchmarks:
<<: *benchmarks
- name: "Qwen3-235B-A22B-W8A8-EPLB"
model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
envs:
<<: *envs
DYNAMIC_EPLB: "true"
server_cmd: *server_cmd
server_cmd_extra:
- "--additional-config"
- '{"eplb_config": {"dynamic_eplb": "true", "expert_heat_collection_interval": 600, "algorithm_execution_interval": 50, "num_redundant_experts": 16, "eplb_policy_type": 2}}'
- "--compilation-config"
- '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
benchmarks:
<<: *benchmarks


@@ -0,0 +1,46 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Qwen3-30B-A3B-W8A8-TP1"
model: "vllm-ascend/Qwen3-30B-A3B-W8A8"
envs:
OMP_PROC_BIND: "false"
OMP_NUM_THREADS: "10"
HCCL_BUFFSIZE: "1024"
HCCL_OP_EXPANSION_MODE: "AIV"
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
SERVER_PORT: "DEFAULT_PORT"
server_cmd:
- "--quantization"
- "ascend"
- "--async-scheduling"
- "--no-enable-prefix-caching"
- "--tensor-parallel-size"
- "1"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "5600"
- "--max-num-batched-tokens"
- "16384"
- "--max-num-seqs"
- "100"
- "--trust-remote-code"
- "--gpu-memory-utilization"
- "0.9"
- "--compilation-config"
- '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
benchmarks:
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 180
max_out_len: 1500
batch_size: 45
request_rate: 0
baseline: 1
threshold: 0.97


@@ -0,0 +1,79 @@
# ==========================================
# Shared Configurations
# ==========================================
_envs: &envs
TASK_QUEUE_ENABLE: "1"
HCCL_OP_EXPANSION_MODE: "AIV"
VLLM_ASCEND_ENABLE_FLASHCOMM: "1"
SERVER_PORT: "DEFAULT_PORT"
_server_cmd: &server_cmd
- "--quantization"
- "ascend"
- "--no-enable-prefix-caching"
- "--tensor-parallel-size"
- "4"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "40960"
- "--max-num-batched-tokens"
- "40960"
- "--block-size"
- "128"
- "--trust-remote-code"
- "--reasoning-parser"
- "qwen3"
- "--gpu-memory-utilization"
- "0.9"
- "--async-scheduling"
- "--additional-config"
- '{"weight_prefetch_config":{"enabled":true}}'
_benchmarks: &benchmarks
acc:
case_type: accuracy
dataset_path: vllm-ascend/aime2024
request_conf: vllm_api_general_chat
dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
max_out_len: 32768
batch_size: 32
baseline: 83.33
threshold: 7
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 288
max_out_len: 1500
batch_size: 72
baseline: 1
threshold: 0.97
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Qwen3-32B-W8A8-aclgraph-a2"
model: "vllm-ascend/Qwen3-32B-W8A8"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--compilation-config"
- '{"cudagraph_mode":"FULL_DECODE_ONLY","cudagraph_capture_sizes":[1,12,16,20,24,32,48,60,64,68,72,76,80]}'
benchmarks:
<<: *benchmarks
- name: "Qwen3-32B-W8A8-single-a2"
model: "vllm-ascend/Qwen3-32B-W8A8"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--enforce-eager"
benchmarks:


@@ -0,0 +1,69 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Qwen3-32B-W8A8-a3-feature-stack3"
model: "vllm-ascend/Qwen3-32B-W8A8"
envs:
VLLM_USE: "1"
TASK_QUEUE_ENABLE: "1"
HCCL_OP_EXPANSION_MODE: "AIV"
OMP_PROC_BIND: "false"
VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE: "1"
VLLM_ASCEND_ENABLE_FLASHCOMM: "1"
SERVER_PORT: "DEFAULT_PORT"
prompts:
- "9.11 and 9.8, which is greater?"
api_keyword_args:
chat_template_kwargs:
enable_thinking: true
server_cmd:
- "--quantization"
- "ascend"
- "--tensor-parallel-size"
- "4"
- "--port"
- "$SERVER_PORT"
- "--trust-remote-code"
- "--reasoning-parser"
- "qwen3"
- "--distributed_executor_backend"
- "mp"
- "--gpu-memory-utilization"
- "0.9"
- "--block-size"
- "128"
- "--max-num-seqs"
- "256"
- "--enforce-eager"
- "--max-model-len"
- "35840"
- "--max-num-batched-tokens"
- "35840"
- "--additional-config"
- '{"enable_weight_nz_layout":true, "weight_prefetch_config":{"enabled": true}}'
- "--compilation-config"
- '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
test_content:
- "chat_completion"
benchmarks:
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_noncot_chat_prompt
max_out_len: 10240
batch_size: 32
baseline: 96
threshold: 4
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 240
max_out_len: 1500
batch_size: 60
baseline: 1
threshold: 0.97


@@ -0,0 +1,78 @@
# ==========================================
# Shared Configurations
# ==========================================
_envs: &envs
TASK_QUEUE_ENABLE: "1"
HCCL_OP_EXPANSION_MODE: "AIV"
VLLM_ASCEND_ENABLE_FLASHCOMM: "1"
SERVER_PORT: "DEFAULT_PORT"
_server_cmd: &server_cmd
- "--quantization"
- "ascend"
- "--no-enable-prefix-caching"
- "--tensor-parallel-size"
- "4"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "40960"
- "--max-num-batched-tokens"
- "40960"
- "--block-size"
- "128"
- "--trust-remote-code"
- "--reasoning-parser"
- "qwen3"
- "--gpu-memory-utilization"
- "0.9"
- "--async-scheduling"
- "--additional-config"
- '{"weight_prefetch_config":{"enabled":true}}'
_benchmarks: &benchmarks
acc:
case_type: accuracy
dataset_path: vllm-ascend/aime2024
request_conf: vllm_api_general_chat
dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
max_out_len: 32768
batch_size: 32
baseline: 83.33
threshold: 7
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 304
max_out_len: 1500
batch_size: 76
baseline: 1
threshold: 0.97
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Qwen3-32B-W8A8-aclgraph-a3"
model: "vllm-ascend/Qwen3-32B-W8A8"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--compilation-config"
- '{"cudagraph_mode":"FULL_DECODE_ONLY","cudagraph_capture_sizes":[1,12,16,20,24,32,48,60,64,68,72,76,80]}'
benchmarks:
<<: *benchmarks
- name: "Qwen3-32B-W8A8-single-a3"
model: "vllm-ascend/Qwen3-32B-W8A8"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--enforce-eager"
benchmarks:


@@ -0,0 +1,51 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Qwen3-32B-TP4"
model: "Qwen/Qwen3-32B"
envs:
TASK_QUEUE_ENABLE: "1"
OMP_PROC_BIND: "false"
HCCL_OP_EXPANSION_MODE: "AIV"
PAGED_ATTENTION_MASK_LEN: "5500"
SERVER_PORT: "DEFAULT_PORT"
server_cmd:
- "--no-enable-prefix-caching"
- "--tensor-parallel-size"
- "4"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "36864"
- "--max-num-batched-tokens"
- "36864"
- "--block-size"
- "128"
- "--trust-remote-code"
- "--gpu-memory-utilization"
- "0.9"
- "--additional-config"
- '{"enable_weight_nz_layout":true}'
benchmarks:
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 32768
batch_size: 32
baseline: 95
threshold: 5
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 80
max_out_len: 1500
batch_size: 20
request_rate: 0
baseline: 1
threshold: 0.97


@@ -0,0 +1,75 @@
# ==========================================
# Shared Configurations
# ==========================================
_envs: &envs
OMP_NUM_THREADS: "10"
OMP_PROC_BIND: "false"
HCCL_BUFFSIZE: "1024"
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
SERVER_PORT: "DEFAULT_PORT"
_server_cmd: &server_cmd
- "--tensor-parallel-size"
- "4"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "40960"
- "--trust-remote-code"
- "--async-scheduling"
- "--no-enable-prefix-caching"
- "--enable-expert-parallel"
- "--gpu-memory-utilization"
- "0.8"
- "--max-num-seqs"
- "64"
_benchmarks: &benchmarks
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 256
max_out_len: 1500
batch_size: 64
baseline: 1
threshold: 0.97
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 32768
batch_size: 32
top_k: 20
baseline: 95
threshold: 5
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Qwen3-Next-80B-A3B-Instruct-aclgraph-8192-a2"
model: "Qwen/Qwen3-Next-80B-A3B-Instruct"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--max-num-batched-tokens"
- "8192"
benchmarks:
<<: *benchmarks
- name: "Qwen3-Next-80B-A3B-Instruct-aclgraph-32768-a2"
model: "Qwen/Qwen3-Next-80B-A3B-Instruct"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--max-num-batched-tokens"
- "32768"
benchmarks:
<<: *benchmarks


@@ -0,0 +1,45 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Qwen3-Next-80B-A3B-Instruct-W8A8"
model: "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8"
envs:
OMP_NUM_THREADS: "10"
OMP_PROC_BIND: "false"
HCCL_BUFFSIZE: "1024"
SERVER_PORT: "DEFAULT_PORT"
server_cmd:
- "--quantization"
- "ascend"
- "--async-scheduling"
- "--no-enable-prefix-caching"
- "--data-parallel-size"
- "1"
- "--tensor-parallel-size"
- "4"
- "--enable-expert-parallel"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "40960"
- "--max-num-batched-tokens"
- "8192"
- "--max-num-seqs"
- "32"
- "--trust-remote-code"
- "--gpu-memory-utilization"
- "0.65"
- "--compilation-config"
- '{"cudagraph_capture_sizes": [32]}'
benchmarks:
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 32768
batch_size: 32
baseline: 95
threshold: 5


@@ -0,0 +1,75 @@
# ==========================================
# Shared Configurations
# ==========================================
_envs: &envs
OMP_NUM_THREADS: "10"
OMP_PROC_BIND: "false"
HCCL_BUFFSIZE: "1024"
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
SERVER_PORT: "DEFAULT_PORT"
_server_cmd: &server_cmd
- "--tensor-parallel-size"
- "4"
- "--port"
- "$SERVER_PORT"
- "--max-model-len"
- "40960"
- "--trust-remote-code"
- "--async-scheduling"
- "--no-enable-prefix-caching"
- "--enable-expert-parallel"
- "--gpu-memory-utilization"
- "0.8"
- "--max-num-seqs"
- "64"
_benchmarks: &benchmarks
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 256
max_out_len: 1500
batch_size: 64
baseline: 1
threshold: 0.97
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 32768
batch_size: 32
top_k: 20
baseline: 95
threshold: 5
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
- name: "Qwen3-Next-80B-A3B-Instruct-aclgraph-8192-a3"
model: "Qwen/Qwen3-Next-80B-A3B-Instruct"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--max-num-batched-tokens"
- "8192"
benchmarks:
<<: *benchmarks
- name: "Qwen3-Next-80B-A3B-Instruct-aclgraph-32768-a3"
model: "Qwen/Qwen3-Next-80B-A3B-Instruct"
envs:
<<: *envs
server_cmd: *server_cmd
server_cmd_extra:
- "--max-num-batched-tokens"
- "32768"
benchmarks:
<<: *benchmarks


@@ -0,0 +1,312 @@
# vLLM-Ascend Single-Node E2E Test Developer Guide
This document is intended to help developers understand the architecture of the single-node E2E (End-to-End) testing framework in `vllm-ascend`, how to run test scripts, and how to add custom testing functionality by writing YAML configuration files and extending the code.
## 1. Test Architecture Overview
To achieve high readability, extensibility, and decoupling of configuration from code, the single-node E2E tests adopt a **"YAML-driven + Dispatcher"** architecture.
It consists of the following core components:
* **Configuration Parser (`single_node_config.py`)**: Responsible for reading `models/configs/*.yaml` files and parsing them into a strongly-typed `@dataclass` (`SingleNodeConfig`) via `SingleNodeConfigLoader`, while handling regex-based expansion of environment-variable placeholders.
* **Service Manager Framework (`test_single_node.py` and `conftest.py`)**: Based on the `service_mode` (`openai` or `epd`), it utilizes context managers to safely start/stop server processes.
* **Test Function Dispatcher (`TEST_HANDLERS` Registry)**: Specific test logic is encapsulated into independent functions and registered in the global `TEST_HANDLERS` dictionary.
* **Performance Benchmarking (`_run_benchmarks`)**: Calls `aisbench` for performance and TTFT testing based on the `benchmarks` parameters in the YAML.
### 1.1 Key Files and Responsibilities
* `tests/e2e/nightly/single_node/models/scripts/single_node_config.py`
* Defines `SingleNodeConfig` and `SingleNodeConfigLoader`
* Loads YAML from `tests/e2e/nightly/single_node/models/configs/<CONFIG_YAML_PATH>`
* Auto-assigns ports when `envs` contains `DEFAULT_PORT` / missing values
* Expands `$VAR` / `${VAR}` placeholders inside commands via `_expand_values`
* `tests/e2e/nightly/single_node/models/scripts/test_single_node.py`
* Declares `configs = SingleNodeConfigLoader.from_yaml_cases()` (loaded at import time)
* `pytest.mark.parametrize("config", configs, ids=[config.name for config in configs])` runs one test per YAML case
* Controls server lifecycle via context managers
* Dispatches `test_content` to functions registered in `TEST_HANDLERS`
* Runs `aisbench` and optional benchmark assertions
### 1.2 End-to-End Flow (High Level)
```txt
pytest starts
|
v
import tests/e2e/nightly/single_node/models/scripts/test_single_node.py
|
v
configs = SingleNodeConfigLoader.from_yaml_cases()
|
v
pytest parametrize("config", configs) # one config == one test case
|
v
test_single_node(config)
|
+-----------------------------------------------+
| Start service (depends on service_mode) |
| |
| openai: start one vLLM OpenAI-compatible |
| service process |
| epd: start (encode service + decode/PD |
| service) + start proxy process |
+-----------------------------------------------+
|
v
Run test phases (test_content)
|
v
Optional benchmarks (if benchmarks is configured)
|
v
Shutdown all started processes
Notes:
- One YAML file may contain multiple test_cases; pytest will run them one by one.
- The framework is "YAML-driven": changes are typically done by editing YAML rather than editing Python code.
```
### 1.3 Function Call Relationships (Dispatcher)
`test_content` is a list of “phases”. Each phase maps to one handler function.
```txt
For each test_case:
test_content (list of phases)
|
v
[Dispatcher]
|
+--> phase "completion" -> send completion request(s)
|
+--> phase "chat_completion" -> send chat completion request(s)
|
+--> phase "image" -> send multimodal image request(s)
|
\--> (extendable) add your own phase by registering a new handler
After phases:
if benchmarks is configured -> run aisbench
Notes:
- The dispatcher only controls "what to run"; service lifecycle is controlled by the service manager.
- Phases are intentionally small & composable so you can reuse them across YAML cases.
```
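In code, the dispatch step is little more than a dictionary lookup. A minimal sketch of the pattern (the actual loop lives in `test_single_node.py`; handlers are `async`, as in the extension example in section 4):

```python
# Sketch: run each configured phase by looking up its registered handler.
async def run_phases(config, server) -> None:
    for phase in config.test_content:
        handler = TEST_HANDLERS.get(phase)
        if handler is None:
            raise ValueError(f"Unknown test phase: {phase!r}")
        await handler(config, server)
```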
## 2. Running and Debugging Steps
### 2.1 Dependencies
Ensure you are in an NPU environment and have installed `pytest`, `pyyaml`, `openai`, and `aisbench`.
### 2.2 Local Execution
The framework uses the `CONFIG_YAML_PATH` environment variable to specify the configuration file.
```bash
# Switch to the project root directory
cd /vllm-workspace/vllm-ascend
# Run a specific yaml test
export CONFIG_YAML_PATH="Qwen3-32B.yaml"
pytest -sv tests/e2e/nightly/single_node/models/scripts/test_single_node.py
```
### 2.3 Tips for Debugging
* Only run a subset of cases: `pytest -sv ... -k <keyword>` (matches case names in the report output)
* Stop on first failure: `pytest -sv ... -x`
* Keep server logs visible: use `-s` (already included in `-sv`) and increase log verbosity via standard Python logging configuration if needed.
## 3. How to Write YAML Configuration Files
### 3.1 File Location and Selection Rules
* YAML files live under: `tests/e2e/nightly/single_node/models/configs/`
* Selected by env var: `CONFIG_YAML_PATH=<YourConfig>.yaml`
* If not set, the loader uses `SingleNodeConfigLoader.DEFAULT_CONFIG_NAME`
### 3.2 Field Descriptions
| Field Name | Type | Required | Default Value | Description |
| :--------------- | :--------- | :------- | :-------------- | :------------------------------------------------------------------ |
| `test_cases` | list | **Yes** | - | List of test case objects |
| `name` | string | **Yes** | - | Human-readable case ID shown in pytest output and logs |
| `model` | string | **Yes** | - | Model name or local path |
| `service_mode` | string | No | `openai` | Service mode: `openai` or `epd` (disaggregated) |
| `envs` | map | **Yes** | `{}` | Environment variables for the server process |
| `server_cmd` | list | Cond. | `[]` | vLLM startup arguments (Required for non-EPD) |
| `server_cmd_extra` | list | No | `[]` | Extra vLLM startup arguments appended after `server_cmd` |
| `prompts` | list | No | built-in default | Prompts for completion/chat tests |
| `api_keyword_args` | map | No | built-in default | OpenAI API keyword args (e.g., `max_tokens`, sampling params) |
| `test_content` | list | No | `["completion"]` | Test phases: `completion`, `chat_completion`, `image`, etc. |
| `benchmarks` | map | No | `{}` | Configuration for `aisbench` performance verification |
| `epd_server_cmds` | list[list] | Cond. | `[]` | (EPD only) Command arrays for starting the Encode and PD/Decode server processes |
| `epd_proxy_args` | list | Cond. | `[]` | (EPD only) Startup arguments for the EPD routing gateway |
**Notes / Behaviors**
* `name` is mandatory and must be a non-empty string.
* It is used directly as pytest case id (e.g., `test_single_node[DeepSeek-R1-0528-W8A8-single]`).
* It is also printed in the `[single-node][START]` marker for log navigation.
* `envs` (ports): the config object recognizes these keys: `SERVER_PORT`, `ENCODE_PORT`, `PD_PORT`, `PROXY_PORT`.
* If a port key is missing or set to `DEFAULT_PORT`, it will be automatically filled with an available open port.
* `$SERVER_PORT` / `${SERVER_PORT}` placeholders in commands will be expanded using `envs` (see the sketch after this list).
* `server_cmd` vs `server_cmd_extra`:
* YAML can define `server_cmd_extra` to append additional args after `server_cmd`.
* The loader merges them into a single `server_cmd` list.
* Extra fields:
* Any non-standard fields in a case are stored in `config.extra_config`.
* This is how extension configs are passed through without changing the dataclass.
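For reference, the placeholder expansion is implemented by `_expand_values` in `single_node_config.py` (included in full later in this commit); its behavior reduces to this self-contained sketch:

```python
import re

def expand(values: list[str], envs: dict[str, str]) -> list[str]:
    # $VAR or ${VAR} is replaced from envs; unknown names are left as-is.
    pattern = re.compile(r"\$(\w+)|\$\{(\w+)\}")
    def repl(m: re.Match) -> str:
        key = m.group(1) or m.group(2)
        return str(envs.get(key, m.group(0)))
    return [pattern.sub(repl, str(v)) for v in values]

print(expand(["--port", "$SERVER_PORT", "http://localhost:${SERVER_PORT}"],
             {"SERVER_PORT": "8080"}))
# -> ['--port', '8080', 'http://localhost:8080']
```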
### 3.3 YAML Examples
#### Single-Case (similar to DeepSeek-R1-W8A8-HBM)
```yaml
test_cases:
- name: "<your-case-name>"
model: "<model-repo-or-local-path>"
# Optional: The default values are as follows
prompts:
- "San Francisco is a"
api_keyword_args:
max_tokens: 10
envs:
SERVER_PORT: "DEFAULT_PORT"
# Add only what you need.
server_cmd:
- "--port"
- "$SERVER_PORT"
# plus your vLLM serve args...
# Optional: omit -> defaults to ["completion"]
test_content:
- "chat_completion"
# Optional: leave empty if you don't run aisbench
benchmarks:
```
#### Multi-Case + Shared Anchors
```yaml
_envs: &envs
SERVER_PORT: "DEFAULT_PORT"
# shared envs...
_server_cmd: &server_cmd
- "--port"
- "$SERVER_PORT"
# shared vLLM serve args...
_benchmarks: &benchmarks
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 400
max_out_len: 1500
batch_size: 1000
baseline: 1
threshold: 0.97
test_cases:
- name: "case-a"
model: "<model>"
envs:
<<: *envs
DYNAMIC_EPLB: "true"
# private envs...
server_cmd: *server_cmd
server_cmd_extra:
- "--enforce-eager"
benchmarks:
- name: "case-b"
model: "<model>"
envs:
<<: *envs
server_cmd: *server_cmd
benchmarks:
      <<: *benchmarks
```
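A note on the merge key: `<<:` merges a mapping anchor into the current mapping, and sibling keys (like `DYNAMIC_EPLB` above) extend or override the merged values. To combine several anchors in one mapping, pass them as a sequence; strictly speaking, repeating the `<<` key inside one mapping is a duplicate key and not valid YAML. A quick PyYAML check (anchor names here are illustrative):

```python
import yaml

snippet = """
_acc: &acc
  acc: {case_type: accuracy}
_perf: &perf
  perf: {case_type: performance}
case:
  benchmarks:
    <<: [*acc, *perf]
"""
print(yaml.safe_load(snippet)["case"]["benchmarks"])
# -> {'acc': {'case_type': 'accuracy'}, 'perf': {'case_type': 'performance'}}
```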
#### EPD / Disaggregated Case
```yaml
test_cases:
- name: "<your-epd-case>"
model: "<model>"
service_mode: "epd"
envs:
ENCODE_PORT: "DEFAULT_PORT"
PD_PORT: "DEFAULT_PORT"
PROXY_PORT: "DEFAULT_PORT"
epd_server_cmds:
- ["--port", "$ENCODE_PORT", "--model", "<encode-model>"]
- ["--port", "$PD_PORT", "--model", "<decode-model>"]
epd_proxy_args:
- "--host"
- "127.0.0.1"
- "--port"
- "$PROXY_PORT"
- "--encode-servers-urls"
- "http://localhost:$ENCODE_PORT"
- "--decode-servers-urls"
- "http://localhost:$PD_PORT"
- "--prefill-servers-urls"
- "disable"
test_content:
- "chat_completion"
```
## 4. How to Add Custom Tests (Extension)
### Step 1: Write your test logic in `test_single_node.py`
```python
async def run_video_test(config: SingleNodeConfig, server: 'RemoteOpenAIServer | DisaggEpdProxy') -> None:
client = server.get_async_client()
# Your custom logic here...
```
### Step 2: Register your function in `TEST_HANDLERS`
```python
TEST_HANDLERS = {
"completion": run_completion_test,
"video": run_video_test, # Registered!
}
```
### Step 3: Enable in YAML
```yaml
test_content:
- "completion"
- "video"
```
## 5. Checklist (Before Submitting a New YAML)
* `test_cases` exists and is a list
* Each case contains required fields for its `service_mode`
* Common required: `name`, `model`, `envs`
* `openai`: `server_cmd`
* `epd`: `epd_server_cmds`, `epd_proxy_args`
* Port envs are set to `DEFAULT_PORT` (or to explicit free ports)
* If using `benchmarks`, ensure each benchmark case includes required aisbench fields (e.g., `case_type`, `dataset_path`, `request_conf`, `dataset_conf`, `max_out_len`, `batch_size`)
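Beyond eyeballing the checklist, you can let the loader validate a new file directly, since constructing the configs runs the same required-field checks as CI. A sketch, assuming you run from the repo root with the scripts directory added to `sys.path`:

```python
import sys

sys.path.insert(0, "tests/e2e/nightly/single_node/models/scripts")
from single_node_config import SingleNodeConfigLoader

# Raises KeyError/TypeError/ValueError on a malformed config file.
for cfg in SingleNodeConfigLoader.from_yaml_cases("Qwen3-32B.yaml"):
    print(cfg.name, cfg.service_mode, cfg.server_port)
```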


@@ -0,0 +1,16 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#


@@ -0,0 +1,183 @@
import logging
import os
import re
from dataclasses import dataclass, field
from typing import Any
import yaml
from vllm.utils.network_utils import get_open_port
CONFIG_BASE_PATH = "tests/e2e/nightly/single_node/models/configs"
logger = logging.getLogger(__name__)
# Default prompts and API args fallback
PROMPTS = [
"San Francisco is a",
]
API_KEYWORD_ARGS = {
"max_tokens": 10,
}
@dataclass
class SingleNodeConfig:
name: str
model: str
envs: dict[str, Any] = field(default_factory=dict)
    # Copy the module-level defaults so one case cannot mutate another's.
    prompts: list[str] = field(default_factory=lambda: list(PROMPTS))
    api_keyword_args: dict[str, Any] = field(default_factory=lambda: dict(API_KEYWORD_ARGS))
benchmarks: dict[str, Any] = field(default_factory=dict)
server_cmd: list[str] = field(default_factory=list)
test_content: list[str] = field(default_factory=lambda: ["completion"])
service_mode: str = "openai"
epd_server_cmds: list[list[str]] = field(default_factory=list)
epd_proxy_args: list[str] = field(default_factory=list)
extra_config: dict[str, Any] = field(default_factory=dict)
def __post_init__(self) -> None:
port_keys = ["SERVER_PORT", "ENCODE_PORT", "PD_PORT", "PROXY_PORT"]
for env_key in port_keys:
if self.envs.get(env_key) in ["DEFAULT_PORT", None]:
self.envs[env_key] = str(get_open_port())
if self.prompts is None:
self.prompts = PROMPTS
if self.api_keyword_args is None:
self.api_keyword_args = API_KEYWORD_ARGS
if self.benchmarks is None:
self.benchmarks = {}
if self.test_content is None:
self.test_content = []
self.server_cmd = self._expand_values(self.server_cmd or [], self.envs)
self.epd_server_cmds = [self._expand_values(cmd, self.envs) for cmd in self.epd_server_cmds]
self.epd_proxy_args = self._expand_values(self.epd_proxy_args or [], self.envs)
for key, value in self.extra_config.items():
setattr(self, key, value)
@staticmethod
def _expand_values(values: list[str], envs: dict[str, Any]) -> list[str]:
"""Interpolate $VAR/${VAR} placeholders with provided env values."""
pattern = re.compile(r"\$(\w+)|\$\{(\w+)\}")
def repl(m: re.Match[str]) -> str:
key = m.group(1) or m.group(2)
return str(envs.get(key, m.group(0)))
return [pattern.sub(repl, str(arg)) for arg in values]
def _get_required_port(self, key: str) -> int:
value = self.envs.get(key)
if value is None:
raise ValueError(f"Missing required port env: {key}")
return int(value)
@property
def server_port(self) -> int:
return self._get_required_port("SERVER_PORT")
@property
def encode_port(self) -> int:
return self._get_required_port("ENCODE_PORT")
@property
def pd_port(self) -> int:
return self._get_required_port("PD_PORT")
@property
def proxy_port(self) -> int:
return self._get_required_port("PROXY_PORT")
class SingleNodeConfigLoader:
"""Load SingleNodeConfig from yaml file."""
DEFAULT_CONFIG_NAME = "Kimi-K2-Thinking.yaml"
STANDARD_CASE_FIELDS = {
"name",
"model",
"envs",
"prompts",
"api_keyword_args",
"benchmarks",
"service_mode",
"server_cmd",
"server_cmd_extra",
"test_content",
"epd_server_cmds",
"epd_proxy_args",
}
@classmethod
def from_yaml_cases(cls, yaml_path: str | None = None) -> list[SingleNodeConfig]:
config = cls._load_yaml(yaml_path)
if "test_cases" not in config:
raise KeyError("test_cases field is required in config yaml")
cases = config.get("test_cases")
if not isinstance(cases, list):
raise TypeError("test_cases must be a list")
cls._validate_para(cases)
return cls._parse_test_cases(cases)
@classmethod
def _load_yaml(cls, yaml_path: str | None) -> dict[str, Any]:
if not yaml_path:
yaml_path = os.getenv("CONFIG_YAML_PATH", cls.DEFAULT_CONFIG_NAME)
full_path = os.path.join(CONFIG_BASE_PATH, yaml_path)
logger.info("Loading config yaml: %s", full_path)
with open(full_path) as f:
return yaml.safe_load(f)
@staticmethod
def _validate_para(cases: list[dict[str, Any]]) -> None:
if not cases:
raise ValueError("test_cases is empty")
for case in cases:
mode = case.get("service_mode", "openai")
required = ["name", "model", "envs"]
if mode == "epd":
required.extend(["epd_server_cmds", "epd_proxy_args"])
else:
required.append("server_cmd")
missing = [k for k in required if k not in case]
if missing:
raise KeyError(f"Missing required config fields: {missing}")
if not isinstance(case["name"], str) or not case["name"].strip():
raise ValueError("test case field 'name' must be a non-empty string")
@classmethod
def _parse_test_cases(cls, cases: list[dict[str, Any]]) -> list[SingleNodeConfig]:
result: list[SingleNodeConfig] = []
for case in cases:
server_cmd = case.get("server_cmd", [])
server_cmd_extra = case.get("server_cmd_extra", [])
full_cmd = list(server_cmd) + list(server_cmd_extra)
extra_case_fields = {key: value for key, value in case.items() if key not in cls.STANDARD_CASE_FIELDS}
            # Map the raw case dict onto SingleNodeConfig, applying defaults for optional fields.
result.append(
SingleNodeConfig(
name=case["name"],
model=case["model"],
envs=case.get("envs", {}),
server_cmd=full_cmd,
epd_server_cmds=case.get("epd_server_cmds", []),
epd_proxy_args=case.get("epd_proxy_args", []),
benchmarks=case.get("benchmarks", {}),
prompts=case.get("prompts", PROMPTS),
api_keyword_args=case.get("api_keyword_args", API_KEYWORD_ARGS),
test_content=case.get("test_content", ["completion"]),
service_mode=case.get("service_mode", "openai"),
extra_config=extra_case_fields,
)
)
return result


@@ -0,0 +1,165 @@
import logging
from typing import Any
import openai
import pytest
from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer, RemoteOpenAIServer
from tests.e2e.nightly.single_node.models.scripts.single_node_config import (
SingleNodeConfig,
SingleNodeConfigLoader,
)
from tools.aisbench import run_aisbench_cases
logger = logging.getLogger(__name__)
configs = SingleNodeConfigLoader.from_yaml_cases()
async def run_completion_test(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
client = server.get_async_client()
batch = await client.completions.create(
model=config.model,
prompt=config.prompts,
**config.api_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
async def run_image_test(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
from tools.send_mm_request import send_image_request
send_image_request(config.model, server)
async def run_chat_completion_test(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
from tools.send_request import send_v1_chat_completions
send_v1_chat_completions(
config.prompts[0],
model=config.model,
server=server,
request_args=config.api_keyword_args,
)
def run_benchmark_comparisons(config: SingleNodeConfig, results: Any) -> None:
"""General assertion engine for aisbench outcomes mapped directly from YAML."""
comparisons = config.extra_config.get("benchmark_comparisons_args", [])
if not comparisons:
return
# Valid task keys defined in benchmarks mapping
valid_keys = [k for k, v in config.benchmarks.items() if v]
metrics_cache = {}
for comp in comparisons:
metric = comp.get("metric", "TTFT")
baseline_key = comp.get("baseline")
target_key = comp.get("target")
ratio = comp.get("ratio", 1.0)
op = comp.get("operator", "<")
if not baseline_key or not target_key:
logger.warning("Invalid comparison config: missing baseline or target. %s", comp)
continue
if metric not in metrics_cache:
if metric == "TTFT":
from tools.aisbench import get_TTFT
# map TTFT outputs directly to their corresponding benchmark test case names
metrics_cache[metric] = dict(zip(valid_keys, get_TTFT(results)))
else:
logger.warning("Unsupported metric for comparison: %s", metric)
continue
metric_dict = metrics_cache[metric]
baseline_val = metric_dict.get(baseline_key)
target_val = metric_dict.get(target_key)
if baseline_val is None or target_val is None:
logger.warning("Missing data to compare %s and %s in metrics: %s", baseline_key, target_key, metric_dict)
continue
expected_threshold = baseline_val * ratio
eval_str = f"metric {metric}: {target_key}({target_val}) {op} {baseline_key}({baseline_val}) * {ratio}"
if op == "<":
assert target_val < expected_threshold, f"Assertion Failed: {eval_str} [threshold: {expected_threshold}]"
elif op == ">":
assert target_val > expected_threshold, f"Assertion Failed: {eval_str} [threshold: {expected_threshold}]"
elif op == "<=":
assert target_val <= expected_threshold, f"Assertion Failed: {eval_str} [threshold: {expected_threshold}]"
elif op == ">=":
assert target_val >= expected_threshold, f"Assertion Failed: {eval_str} [threshold: {expected_threshold}]"
else:
logger.warning("Unsupported comparison operator: %s", op)
continue
print(f"✅ Comparison passed: {eval_str} [threshold: {expected_threshold}]")
# Extend this dictionary to add new test capabilities
TEST_HANDLERS = {
"completion": run_completion_test,
"image": run_image_test,
"chat_completion": run_chat_completion_test,
}
async def _dispatch_tests(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
"""Dispatches requested tests defined in yaml."""
for test_name in config.test_content:
if test_name == "benchmark_comparisons":
continue
handler = TEST_HANDLERS.get(test_name)
if handler:
await handler(config, server)
else:
logger.warning("No handler registered for test content type: %s", test_name)
def _run_benchmarks(config: SingleNodeConfig, port: int) -> None:
"""Run Aisbench benchmarks and process benchmark-dependent custom assertions."""
aisbench_cases = [v for v in config.benchmarks.values() if v]
if not aisbench_cases:
return
result = run_aisbench_cases(
model=config.model,
port=port,
aisbench_cases=aisbench_cases,
)
if "benchmark_comparisons" in config.test_content:
run_benchmark_comparisons(config, result)
@pytest.mark.asyncio
@pytest.mark.parametrize("config", configs, ids=[config.name for config in configs])
async def test_single_node(config: SingleNodeConfig) -> None:
if config.service_mode == "epd":
with (
RemoteEPDServer(vllm_serve_args=config.epd_server_cmds, env_dict=config.envs) as _,
DisaggEpdProxy(proxy_args=config.epd_proxy_args, env_dict=config.envs) as proxy,
):
await _dispatch_tests(config, proxy)
_run_benchmarks(config, proxy.port)
return
# Standard OpenAI service mode
with RemoteOpenAIServer(
model=config.model,
vllm_serve_args=config.server_cmd,
server_port=config.server_port,
env_dict=config.envs,
auto_port=False,
) as server:
await _dispatch_tests(config, server)
_run_benchmarks(config, config.server_port)


@@ -1,118 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"vllm-ascend/DeepSeek-R1-0528-W8A8",
]
MODES = [
"single",
"aclgraph",
]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
"max_out_len": 32768,
"batch_size": 32,
"baseline": 95,
"threshold": 5
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 400,
"max_out_len": 1500,
"batch_size": 1000,
"baseline": 1,
"threshold": 0.97
}]
def config():
port = get_open_port()
env_dict = {
"OMP_NUM_THREADS": "10",
"OMP_PROC_BIND": "false",
"HCCL_BUFFSIZE": "1024",
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True"
}
speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
additional_config = {"enable_weight_nz_layout": True}
server_args = [
"--quantization", "ascend", "--data-parallel-size", "2",
"--tensor-parallel-size", "8", "--enable-expert-parallel", "--port",
str(port), "--seed", "1024", "--max-model-len", "36864",
"--max-num-batched-tokens", "4096", "--max-num-seqs", "16",
"--trust-remote-code", "--gpu-memory-utilization", "0.9",
"--speculative-config",
json.dumps(speculative_config)
]
return port, env_dict, additional_config, server_args
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
async def test_models(model: str, mode: str) -> None:
port, env_dict, additional_config, server_args = config()
if mode == "single":
server_args.append("--enforce-eager")
server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
if mode in ["single"]:
return
# aisbench test
run_aisbench_cases(model,
port,
aisbench_cases,
server_args=server_args)


@@ -1,82 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
from .test_deepseek_r1_0528_w8a8 import *
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
"max_out_len": 32768,
"batch_size": 32,
"baseline": 95,
"threshold": 5
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_models_eplb(model: str) -> None:
port, env_dict, additional_config, server_args = config()
additional_config.update(
{
"eplb_config": {
"dynamic_eplb": "true",
"expert_heat_collection_interval": 1000,
"algorithm_execution_interval": 50,
"eplb_policy_type": 3,
}
}
)
env_dict.update(
{
"DYNAMIC_EPLB": "true",
}
)
server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
# aisbench test
run_aisbench_cases(model,
port,
aisbench_cases,
server_args=server_args)


@@ -1,123 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"vllm-ascend/DeepSeek-R1-W8A8",
]
MODES = [
"single",
]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
"max_out_len": 6000,
"batch_size": 32,
"baseline": 95,
"threshold": 5
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 32,
"max_out_len": 1500,
"batch_size": 32,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
async def test_models(model: str, mode: str) -> None:
port = get_open_port()
env_dict = {
"HCCL_BUFFSIZE": "1024",
}
additional_config = {
"ascend_scheduler_config": {
"enabled": False
},
"torchair_graph_config": {
"enabled": False,
"enable_multistream_shared_expert": False
}
}
server_args = [
"--quantization", "ascend", "--port",
str(port), "--data-parallel-size", "8", "--data-parallel-size-local",
"8", "--data-parallel-rpc-port", "13389", "--tensor-parallel-size",
"2", "--enable-expert-parallel", "--seed", "1024", "--max-num-seqs",
"32", "--max-model-len", "6000", "--max-num-batched-tokens", "6000",
"--trust-remote-code", "--gpu-memory-utilization", "0.92",
"--no-enable-prefix-caching", "--reasoning-parser", "deepseek_r1"
]
if mode == "single":
server_args.append("--enforce-eager")
server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
# aisbench test
if mode in ["single"]:
return
run_aisbench_cases(model,
port,
aisbench_cases,
server_args=server_args)


@@ -1,122 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = ["vllm-ascend/DeepSeek-V3.2-W8A8"]
TENSOR_PARALLELS = [8]
DATA_PARALLELS = [2]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
"max_out_len": 4096,
"batch_size": 8,
"baseline": 95,
"threshold": 5
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 1,
"max_out_len": 1500,
"batch_size": 1,
"request_rate": 11.2,
"baseline": 134,
"threshold": 0.97
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 100,
"max_out_len": 1500,
"batch_size": 4,
"request_rate": 11.2,
"baseline": 134,
"threshold": 0.97
}
]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
async def test_models(model: str, tp_size: int, dp_size: int) -> None:
port = get_open_port()
env_dict = {
"HCCL_OP_EXPANSION_MODE": "AIV",
"OMP_PROC_BIND": "false",
"OMP_NUM_THREADS": "1",
"HCCL_BUFFSIZE": "1024",
"VLLM_ASCEND_ENABLE_MLAPO": "1",
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1",
"VLLM_ENGINE_READY_TIMEOUT_S": "1800"
}
server_args = [
"--enable-expert-parallel", "--tensor-parallel-size",
str(tp_size), "--data-parallel-size",
str(dp_size), "--port",
str(port), "--max-model-len", "8192", "--max-num-batched-tokens",
"8192", "--max-num-seqs", "4", "--trust-remote-code", "--quantization",
"ascend", "--gpu-memory-utilization", "0.98", "--compilation-config",
'{"cudagraph_capture_sizes":[8, 16, 24, 32, 40, 48], "cudagraph_mode":"FULL_DECODE_ONLY"}',
"--speculative-config",
'{"num_speculative_tokens": 3, "method":"deepseek_mtp"}',
"--additional-config",
'{"layer_sharding": ["q_b_proj", "o_proj"]}',
"--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32"
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
# aisbench test
run_aisbench_cases(model, port, aisbench_cases)


@@ -1,115 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"ZhipuAI/GLM-4.5",
]
TENSOR_PARALLELS = [8]
DATA_PARALLELS = [2]
FULL_GRAPH = [True, False]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
"max_out_len": 4096,
"batch_size": 8,
"baseline": 95,
"threshold": 5
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 16,
"max_out_len": 1500,
"batch_size": 8,
"request_rate": 0,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
@pytest.mark.parametrize("full_graph", FULL_GRAPH)
async def test_models(model: str, tp_size: int, dp_size: int,
full_graph: bool) -> None:
port = get_open_port()
env_dict = {"HCCL_BUFFSIZE": "1024"}
server_args = [
"--no-enable-prefix-caching",
"--enable-expert-parallel",
"--tensor-parallel-size",
str(tp_size),
"--data-parallel-size",
str(dp_size),
"--port",
str(port),
"--max-model-len",
"8192",
"--max-num-batched-tokens",
"8192",
"--block-size",
"16",
"--trust-remote-code",
"--gpu-memory-utilization",
"0.9",
]
if full_graph:
server_args += [
"--compilation-config",
'{"cudagraph_capture": [1,2,4,8,16], "cudagraph_model":"FULL_DECODE_ONLY"}'
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
# aisbench test
run_aisbench_cases(model, port, aisbench_cases)


@@ -1,110 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"moonshotai/Kimi-K2-Thinking",
]
TENSOR_PARALLELS = [16]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
"max_out_len": 4096,
"batch_size": 32,
"baseline": 95,
"threshold": 5
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 512,
"max_out_len": 256,
"batch_size": 64,
"trust_remote_code": True,
"request_rate": 11.2,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
port = get_open_port()
env_dict = {
"HCCL_BUFFSIZE": "1024",
"TASK_QUEUE_ENABLE": "1",
"OMP_PROC_BIND": "false",
"HCCL_OP_EXPANSION_MODE": "AIV",
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True"
}
server_args = [
"--tensor-parallel-size",
str(tp_size),
"--port",
str(port),
"--max-model-len",
"8192",
"--max-num-batched-tokens",
"8192",
"--max-num-seqs",
"12",
"--gpu-memory-utilization",
"0.9",
"--trust-remote-code",
"--enable-expert-parallel",
"--no-enable-prefix-caching",
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
# aisbench test
run_aisbench_cases(model, port, aisbench_cases)


@@ -1,140 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"vllm-ascend/DeepSeek-R1-0528-W8A8",
]
MODES = ["mtp2", "mtp3"]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
aisbench_gsm8k = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
"max_out_len": 32768,
"batch_size": 32,
"baseline": 95,
"threshold": 5
}]
aisbench_aime = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/aime2024",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "aime2024/aime2024_gen_0_shot_chat_prompt",
"max_out_len": 32768,
"batch_size": 32,
"baseline": 86.67,
"threshold": 7
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
async def test_models(model: str, mode: str) -> None:
port = get_open_port()
env_dict = {
"OMP_NUM_THREADS": "100",
"OMP_PROC_BIND": "false",
"HCCL_BUFFSIZE": "1024",
"VLLM_RPC_TIMEOUT": "3600000",
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
}
speculative_config = {"num_speculative_tokens": 2, "method": "mtp"}
compilation_config = {
"cudagraph_capture_sizes": [56],
"cudagraph_mode": "FULL_DECODE_ONLY"
}
server_args = [
"--quantization",
"ascend",
"--seed",
"1024",
"--no-enable-prefix-caching",
"--data-parallel-size",
"2",
"--tensor-parallel-size",
"8",
"--enable-expert-parallel",
"--port",
str(port),
"--max-model-len",
"40960",
"--max-num-seqs",
"14",
"--trust-remote-code",
]
if mode == "mtp2":
server_args.extend(["--max-num-batched-tokens", "4096"])
server_args.extend(
["--speculative-config",
json.dumps(speculative_config)])
server_args.extend(["--gpu-memory-utilization", "0.92"])
aisbench_cases = aisbench_gsm8k
if mode == "mtp3":
env_dict["HCCL_OP_EXPANSION_MODE"] = "AIV"
server_args.extend(["--max-num-batched-tokens", "2048"])
speculative_config["num_speculative_tokens"] = 3
server_args.extend(
["--speculative-config",
json.dumps(speculative_config)])
server_args.extend(["--gpu-memory-utilization", "0.9"])
server_args.extend(
["--compilation-config",
json.dumps(compilation_config)])
aisbench_cases = aisbench_aime
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
# aisbench test
run_aisbench_cases(model,
port,
aisbench_cases,
server_args=server_args)


@@ -1,107 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import get_TTFT, run_aisbench_cases
MODELS = [
"vllm-ascend/DeepSeek-R1-0528-W8A8",
]
aisbench_warm_up = [{
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in1024-bs210",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 210,
"max_out_len": 2,
"batch_size": 1000,
"baseline": 0,
"threshold": 0.97
}]
aisbench_cases0 = [{
"case_type": "performance",
"dataset_path": "vllm-ascend/prefix0-in3500-bs210",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 210,
"max_out_len": 1500,
"batch_size": 18,
"baseline": 1,
"threshold": 0.97
}]
aisbench_cases75 = [{
"case_type": "performance",
"dataset_path": "vllm-ascend/prefix75-in3500-bs210",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 210,
"max_out_len": 1500,
"batch_size": 18,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_models(model: str) -> None:
port = get_open_port()
env_dict = {
"OMP_NUM_THREADS": "10",
"OMP_PROC_BIND": "false",
"HCCL_BUFFSIZE": "1024",
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
}
additional_config = {"enable_weight_nz_layout": True}
speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
server_args = [
"--quantization", "ascend", "--data-parallel-size", "2",
"--tensor-parallel-size", "8", "--enable-expert-parallel", "--port",
str(port), "--seed", "1024", "--max-model-len", "5200",
"--max-num-batched-tokens", "4096", "--max-num-seqs", "16",
"--trust-remote-code", "--gpu-memory-utilization", "0.9",
"--additional-config",
json.dumps(additional_config), "--speculative-config",
json.dumps(speculative_config)
]
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False):
run_aisbench_cases(model, port, aisbench_warm_up)
result = run_aisbench_cases(model, port, aisbench_cases0)
TTFT0 = get_TTFT(result)
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False):
run_aisbench_cases(model, port, aisbench_warm_up)
result = run_aisbench_cases(model, port, aisbench_cases75)
TTFT75 = get_TTFT(result)
assert TTFT75 < 0.8 * TTFT0, f"The TTFT for prefix75 {TTFT75} is not less than 0.8*TTFT for prefix0 {TTFT0}."
print(
f"The TTFT for prefix75 {TTFT75} is less than 0.8*TTFT for prefix0 {TTFT0}."
)


@@ -1,99 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import get_TTFT, run_aisbench_cases
MODELS = [
"vllm-ascend/Qwen3-32B-W8A8",
]
aisbench_warm_up = [{
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in1024-bs210",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 210,
"max_out_len": 2,
"batch_size": 1000,
"baseline": 0,
"threshold": 0.97
}]
aisbench_cases0 = [{
"case_type": "performance",
"dataset_path": "vllm-ascend/prefix0-in3500-bs210",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 210,
"max_out_len": 1500,
"batch_size": 48,
"baseline": 1,
"threshold": 0.97
}]
aisbench_cases75 = [{
"case_type": "performance",
"dataset_path": "vllm-ascend/prefix75-in3500-bs210",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 210,
"max_out_len": 1500,
"batch_size": 48,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_models(model: str) -> None:
port = get_open_port()
env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
additional_config = {"enable_weight_nz_layout": True}
server_args = [
"--quantization", "ascend", "--reasoning-parser", "qwen3",
"--tensor-parallel-size", "4", "--port",
str(port), "--max-model-len", "8192", "--max-num-batched-tokens",
"8192", "--max-num-seqs", "256", "--trust-remote-code",
"--gpu-memory-utilization", "0.9", "--additional-config",
json.dumps(additional_config)
]
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False):
run_aisbench_cases(model, port, aisbench_warm_up)
result = run_aisbench_cases(model, port, aisbench_cases0)
TTFT0 = get_TTFT(result)
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False):
run_aisbench_cases(model, port, aisbench_warm_up)
result = run_aisbench_cases(model, port, aisbench_cases75)
TTFT75 = get_TTFT(result)
assert TTFT75 < 0.8 * TTFT0, f"The TTFT for prefix75 {TTFT75} is not less than 0.8*TTFT for prefix0 {TTFT0}."
print(
f"The TTFT for prefix75 {TTFT75} is less than 0.8*TTFT for prefix0 {TTFT0}."
)


@@ -1,110 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
from tools.send_mm_request import send_image_request
MODELS = [
"Qwen/Qwen2.5-VL-32B-Instruct",
]
TENSOR_PARALLELS = [4]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/textvqa-lite",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"max_out_len": 2048,
"batch_size": 128,
"baseline": 76.22,
"temperature": 0,
"top_k": -1,
"top_p": 1,
"repetition_penalty": 1,
"threshold": 5
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/textvqa-perf-1080p",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"num_prompts": 512,
"max_out_len": 256,
"batch_size": 128,
"temperature": 0,
"top_k": -1,
"top_p": 1,
"repetition_penalty": 1,
"request_rate": 0,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
port = get_open_port()
env_dict = {
"TASK_QUEUE_ENABLE": "1",
"VLLM_ASCEND_ENABLE_NZ": "0",
"HCCL_OP_EXPANSION_MODE": "AIV"
}
server_args = [
"--no-enable-prefix-caching", "--mm-processor-cache-gb", "0",
"--tensor-parallel-size",
str(tp_size), "--port",
str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
"40000", "--max-num-seqs", "400", "--trust-remote-code",
"--gpu-memory-utilization", "0.8", "--compilation_config",
'{"cudagraph_mode": "FULL_DECODE_ONLY"}'
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
send_image_request(model, server)
# aisbench test
run_aisbench_cases(model, port, aisbench_cases)


@@ -1,102 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
from tools.send_mm_request import send_image_request
MODELS = [
"Qwen/Qwen2.5-VL-7B-Instruct",
]
TENSOR_PARALLELS = [4]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/textvqa-lite",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"max_out_len": 2048,
"batch_size": 128,
"baseline": 82.05,
"threshold": 5
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/textvqa-perf-1080p",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"num_prompts": 512,
"max_out_len": 256,
"batch_size": 128,
"request_rate": 0,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
port = get_open_port()
env_dict = {
"TASK_QUEUE_ENABLE": "1",
"VLLM_ASCEND_ENABLE_NZ": "0",
"HCCL_OP_EXPANSION_MODE": "AIV"
}
server_args = [
"--no-enable-prefix-caching", "--mm-processor-cache-gb", "0",
"--tensor-parallel-size",
str(tp_size), "--port",
str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
"40000", "--max-num-seqs", "400", "--trust-remote-code",
"--gpu-memory-utilization", "0.8", "--compilation_config",
'{"cudagraph_mode": "FULL_DECODE_ONLY"}'
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
send_image_request(model, server)
# aisbench test
run_aisbench_cases(model, port, aisbench_cases)


@@ -1,110 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"Qwen/Qwen2.5-VL-7B-Instruct",
]
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
TENSOR_PARALLELS = [1]
warmup_cases = [{
"case_type": "performance",
"dataset_path": "vllm-ascend/textvqa-perf-1080p",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"num_prompts": 50,
"max_out_len": 20,
"batch_size": 32,
"request_rate": 0,
"baseline": 1,
"threshold": 0.97
}]
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/textvqa-lite",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"max_out_len": 2048,
"batch_size": 128,
"baseline": 82.05,
"threshold": 5
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/textvqa-perf-1080p",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "textvqa/textvqa_gen_base64",
"num_prompts": 512,
"max_out_len": 256,
"batch_size": 128,
"request_rate": 0,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
encode_port = get_open_port()
pd_port = get_open_port()
vllm_server_args = [
[
"--port",
str(encode_port), "--model", model, "--gpu-memory-utilization",
"0.01", "--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
"--max-model-len", "10000", "--max-num-batched-tokens", "10000",
"--max-num-seqs", "1", "--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
],
[
"--port",
str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
"--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--max-model-len", "10000",
"--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
"--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
]
]
proxy_port = get_open_port()
proxy_args = [
"--host", "127.0.0.1", "--port",
str(proxy_port), "--encode-servers-urls",
f"http://localhost:{encode_port}", "--decode-servers-urls",
f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
]
with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
with DisaggEpdProxy(proxy_args=proxy_args) as _:
# warm up
run_aisbench_cases(model=model,
port=proxy_port,
aisbench_cases=warmup_cases)
# aisbench test
run_aisbench_cases(model, proxy_port, aisbench_cases)


@@ -1,71 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
from .test_qwen3_235b_w8a8 import *
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_models_eplb(model: str) -> None:
port, aisbench_cases, env_dict, compilation_config, server_args = config()
env_dict.update(
{
"DYNAMIC_EPLB": "true",
}
)
additional_config: dict[str, Any] = {}
additional_config["eplb_config"] = {
"dynamic_eplb": "true",
"expert_heat_collection_interval": 600,
"algorithm_execution_interval": 50,
"num_redundant_experts": 16,
"eplb_policy_type": 2,
}
server_args.extend(["--additional-config", json.dumps(additional_config)])
server_args.extend(
["--compilation-config",
json.dumps(compilation_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
# aisbench test
run_aisbench_cases(model,
port,
aisbench_cases,
server_args=server_args)


@@ -1,104 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"vllm-ascend/Qwen3-235B-A22B-W8A8",
]
MODES = ["full_graph", "piecewise"]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
def config():
port = get_open_port()
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
"max_out_len": 32768,
"batch_size": 32,
"top_k": 20,
"baseline": 95,
"threshold": 5
}]
env_dict = {
"OMP_NUM_THREADS": "10",
"OMP_PROC_BIND": "false",
"HCCL_BUFFSIZE": "1024",
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
}
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
server_args = [
"--quantization", "ascend", "--async-scheduling",
"--data-parallel-size", "4", "--tensor-parallel-size", "4",
"--enable-expert-parallel", "--port",
str(port), "--max-model-len", "40960", "--max-num-batched-tokens",
"8192", "--max-num-seqs", "12", "--trust-remote-code",
"--gpu-memory-utilization", "0.9"
]
return port, aisbench_cases, env_dict, compilation_config, server_args
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
async def test_models(model: str, mode: str) -> None:
port, aisbench_cases, env_dict, compilation_config, server_args = config()
if mode == "piecewise":
compilation_config["cudagraph_mode"] = "PIECEWISE"
server_args.extend(
["--compilation-config",
json.dumps(compilation_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
# aisbench test
run_aisbench_cases(model,
port,
aisbench_cases,
server_args=server_args)


@@ -1,92 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"vllm-ascend/Qwen3-30B-A3B-W8A8",
]
TENSOR_PARALLELS = [1]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
aisbench_cases = [{
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 180,
"max_out_len": 1500,
"batch_size": 45,
"request_rate": 0,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
port = get_open_port()
env_dict = {
"OMP_PROC_BIND": "false",
"OMP_NUM_THREADS": "10",
"HCCL_BUFFSIZE": "1024",
"HCCL_OP_EXPANSION_MODE": "AIV",
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True"
}
server_args = [
"--quantization", "ascend", "--async-scheduling",
"--no-enable-prefix-caching", "--tensor-parallel-size",
str(tp_size), "--port",
str(port), "--max-model-len", "5600", "--max-num-batched-tokens",
"16384", "--max-num-seqs", "100", "--trust-remote-code",
"--gpu-memory-utilization", "0.9", "--compilation-config",
'{"cudagraph_mode": "FULL_DECODE_ONLY"}'
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
# aisbench test
run_aisbench_cases(model, port, aisbench_cases)


@@ -1,99 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"Qwen/Qwen3-32B",
]
TENSOR_PARALLELS = [4]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
"max_out_len": 32768,
"batch_size": 32,
"baseline": 95,
"threshold": 5
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 80,
"max_out_len": 1500,
"batch_size": 20,
"request_rate": 0,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
port = get_open_port()
env_dict = {
"TASK_QUEUE_ENABLE": "1",
"OMP_PROC_BIND": "false",
"HCCL_OP_EXPANSION_MODE": "AIV",
"PAGED_ATTENTION_MASK_LEN": "5500"
}
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size",
str(tp_size), "--port",
str(port), "--max-model-len", "36864", "--max-num-batched-tokens",
"36864", "--block-size", "128", "--trust-remote-code",
"--gpu-memory-utilization", "0.9", "--additional-config",
'{"enable_weight_nz_layout":true}'
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
# aisbench test
run_aisbench_cases(model, port, aisbench_cases)
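# A minimal sketch of how the baseline/threshold pairs above are presumably
# interpreted (assumption: accuracy thresholds are absolute score deltas and
# performance thresholds are throughput ratios; the real checking logic
# lives in tools/aisbench.py, not here).
def within_threshold(case_type: str, measured: float, baseline: float,
                     threshold: float) -> bool:
    if case_type == "accuracy":
        # baseline=95, threshold=5 -> any score >= 90 passes
        return measured >= baseline - threshold
    # baseline=1, threshold=0.97 -> >= 97% of baseline throughput passes
    return measured / baseline >= threshold

assert within_threshold("accuracy", 92.0, 95, 5)
assert within_threshold("performance", 0.98, 1, 0.97)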


@@ -1,129 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
import os
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"vllm-ascend/Qwen3-32B-W8A8",
]
MODES = [
"aclgraph",
"single",
]
TENSOR_PARALLELS = [4]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
batch_size_dict = {
"linux-aarch64-a2b3-4": 72,
"linux-aarch64-a3-4": 76,
}
VLLM_CI_RUNNER = os.getenv("VLLM_CI_RUNNER", "linux-aarch64-a2b3-4")
performance_batch_size = batch_size_dict.get(VLLM_CI_RUNNER, 1)
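# On an unlisted CI runner, the .get() fallback keeps the performance case
# runnable with a batch size of 1 instead of raising a KeyError.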
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/aime2024",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "aime2024/aime2024_gen_0_shot_chat_prompt",
"max_out_len": 32768,
"batch_size": 32,
"baseline": 83.33,
"threshold": 7
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 4 * performance_batch_size,
"max_out_len": 1500,
"batch_size": performance_batch_size,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, mode: str, tp_size: int) -> None:
port = get_open_port()
env_dict = {
"TASK_QUEUE_ENABLE": "1",
"HCCL_OP_EXPANSION_MODE": "AIV",
"VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
}
compilation_config = {
"cudagraph_mode":
"FULL_DECODE_ONLY",
"cudagraph_capture_sizes":
[1, 12, 16, 20, 24, 32, 48, 60, 64, 68, 72, 76, 80]
}
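    # The capture sizes bracket the expected decode batch sizes (72/76 per
    # the runner table above), so FULL_DECODE_ONLY graphs are replayed
    # rather than re-captured at serving time.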
server_args = [
"--quantization", "ascend", "--no-enable-prefix-caching",
"--tensor-parallel-size",
str(tp_size), "--port",
str(port), "--max-model-len", "40960", "--max-num-batched-tokens",
"40960", "--block-size", "128", "--trust-remote-code",
"--reasoning-parser", "qwen3", "--gpu-memory-utilization", "0.9",
"--async-scheduling", "--additional-config",
'{"weight_prefetch_config":{"enabled":true}}',
]
if mode == "single":
server_args.append("--enforce-eager")
if mode == "aclgraph":
server_args.extend(
["--compilation-config",
json.dumps(compilation_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
if mode == "single":
return
# aisbench test
run_aisbench_cases(model, port, aisbench_cases)


@@ -1,98 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
from tools.send_request import send_v1_chat_completions
MODELS = [
"vllm-ascend/Qwen3-32B-W8A8",
]
TENSOR_PARALLELS = [4]
prompts = [
"9.11 and 9.8, which is greater?",
]
api_keyword_args = {
"chat_template_kwargs": {
"enable_thinking": True
},
}
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_noncot_chat_prompt",
"max_out_len": 10240,
"batch_size": 32,
"baseline": 96,
"threshold": 4
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 240,
"max_out_len": 1500,
"batch_size": 60,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
port = get_open_port()
env_dict = {
"VLLM_USE": "1",
"TASK_QUEUE_ENABLE": "1",
"HCCL_OP_EXPANSION_MODE": "AIV",
"OMP_PROC_BIND": "false",
"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1",
"VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
}
server_args = [
"--quantization", "ascend", "--tensor-parallel-size",
str(tp_size), "--port",
str(port), "--trust-remote-code", "--reasoning-parser", "qwen3",
"--distributed_executor_backend", "mp", "--gpu-memory-utilization",
"0.9", "--block-size", "128", "--max-num-seqs", "256",
"--enforce-eager", "--max-model-len", "35840",
"--max-num-batched-tokens", "35840", "--additional-config",
'{"enable_weight_nz_layout":true, "weight_prefetch_config":{"enabled": true}}',
"--compilation-config",
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
]
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
send_v1_chat_completions(prompts[0],
model,
server,
request_args=api_keyword_args)
# aisbench test
run_aisbench_cases(model, port, aisbench_cases)
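# A minimal sketch of the request body that send_v1_chat_completions
# presumably posts in the test above (assumption: it wraps
# /v1/chat/completions and forwards chat_template_kwargs so enable_thinking
# reaches the Qwen3 chat template).
import json

payload = {
    "model": "vllm-ascend/Qwen3-32B-W8A8",
    "messages": [{
        "role": "user",
        "content": "9.11 and 9.8, which is greater?"
    }],
    "chat_template_kwargs": {
        "enable_thinking": True
    },
}
print(json.dumps(payload, indent=2))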


@@ -1,111 +0,0 @@
import json
import os
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"Qwen/Qwen3-Next-80B-A3B-Instruct",
]
MODES = ["aclgraph"]
TENSOR_PARALLELS = [4]
MAX_NUM_BATCHED_TOKENS = [8192, 32768]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
batch_size_dict = {
"linux-aarch64-a2b3-4": 64,
"linux-aarch64-a3-4": 64,
}
VLLM_CI_RUNNER = os.getenv("VLLM_CI_RUNNER", "linux-aarch64-a2b3-4")
performance_batch_size = batch_size_dict.get(VLLM_CI_RUNNER, 1)
aisbench_cases = [{
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 4 * performance_batch_size,
"max_out_len": 1500,
"batch_size": performance_batch_size,
"baseline": 1,
"threshold": 0.97
}, {
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
"max_out_len": 32768,
"batch_size": 32,
"top_k": 20,
"baseline": 95,
"threshold": 5
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("max_num_batched_tokens", MAX_NUM_BATCHED_TOKENS)
async def test_models(model: str, mode: str, tp_size: int,
max_num_batched_tokens: int) -> None:
port = get_open_port()
env_dict = {
"OMP_NUM_THREADS": "10",
"OMP_PROC_BIND": "false",
"HCCL_BUFFSIZE": "1024",
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
}
server_args = [
"--tensor-parallel-size",
str(tp_size),
"--port",
str(port),
"--max-model-len",
"40960",
"--max-num-batched-tokens",
str(max_num_batched_tokens),
"--trust-remote-code",
"--async-scheduling",
"--no-enable-prefix-caching",
"--enable-expert-parallel",
"--gpu-memory-utilization",
"0.8",
"--max-num-seqs",
"64",
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
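        # NOTE: MODES above only contains "aclgraph", so this early return
        # is a leftover from the graph/eager template and never fires here.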
if mode == "single":
return
# aisbench test
run_aisbench_cases(model, port, aisbench_cases)


@@ -1,104 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8",
]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
"max_out_len": 32768,
"batch_size": 32,
"baseline": 95,
"threshold": 5
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_models(model: str) -> None:
port = get_open_port()
env_dict = {
"OMP_NUM_THREADS": "10",
"OMP_PROC_BIND": "false",
"HCCL_BUFFSIZE": "1024",
}
server_args = [
"--quantization",
"ascend",
"--async-scheduling",
"--no-enable-prefix-caching",
"--data-parallel-size",
"1",
"--tensor-parallel-size",
"4",
"--enable-expert-parallel",
"--port",
str(port),
"--max-model-len",
"40960",
"--max-num-batched-tokens",
"8192",
"--max-num-seqs",
"32",
"--trust-remote-code",
"--gpu-memory-utilization",
"0.65",
"--compilation-config",
'{"cudagraph_capture_sizes": [32]}',
]
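    # A single capture size of 32 matches --max-num-seqs above, so every
    # decode batch pads up to the one captured graph shape.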
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
print(choices)
# aisbench test
run_aisbench_cases(model,
port,
aisbench_cases,
server_args=server_args)


@@ -1,115 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
MODELS = [
"Qwen/QwQ-32B",
]
MODES = [
"aclgraph",
"single",
]
TENSOR_PARALLELS = [4]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
aisbench_cases = [{
"case_type": "accuracy",
"dataset_path": "vllm-ascend/gsm8k-lite",
"request_conf": "vllm_api_general_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
"max_out_len": 32768,
"batch_size": 32,
"baseline": 95,
"threshold": 5
}, {
"case_type": "performance",
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
"request_conf": "vllm_api_stream_chat",
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
"num_prompts": 240,
"max_out_len": 1500,
"batch_size": 60,
"baseline": 1,
"threshold": 0.97
}]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, mode: str, tp_size: int) -> None:
port = get_open_port()
env_dict = {
"TASK_QUEUE_ENABLE": "1",
"OMP_PROC_BIND": "false",
"HCCL_OP_EXPANSION_MODE": "AIV",
"VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
"VLLM_ASCEND_ENABLE_DEBSE_OPTIMIZE": "1"
}
server_args = [
"--tensor-parallel-size",
str(tp_size), "--port",
str(port), "--max-model-len", "36864", "--max-num-batched-tokens",
"36864", "--block-size", "128", "--trust-remote-code",
"--gpu-memory-utilization", "0.9", "--compilation_config",
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}',
"--reasoning-parser", "deepseek_r1", "--distributed_executor_backend",
"mp", "--additional-config", '{"weight_prefetch_config":{"enabled":true}}'
]
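    # Eager mode strips the two compilation-config argv entries defined
    # above; list.remove() matches exact strings, so the JSON literal below
    # must stay byte-identical to the one in server_args.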
if mode == "single":
server_args.remove("--compilation_config")
server_args.remove(
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
)
server_args.append("--enforce-eager")
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
)
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"
if mode == "single":
return
# aisbench test
run_aisbench_cases(model, port, aisbench_cases)


@@ -245,9 +245,11 @@ def run_aisbench_cases(model, port, aisbench_cases, server_args="", host_ip="loc
     return aisbench_results
-def get_TTFT(result):
-    TTFT = result[0][0].loc["TTFT", "Average"][:-3]
-    return float(TTFT)
+def get_TTFT(results):
+    TTFT = []
+    for result in results:
+        TTFT.append(float(result[0].loc["TTFT", "Average"][:-3]))
+    return TTFT
 temp_dir = tempfile.gettempdir()
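# A minimal sketch of the get_TTFT change above (assumption: each aisbench
# result wraps a pandas DataFrame whose "Average" column stores strings
# like "123.4 ms", so [:-3] strips the " ms" unit before parsing).
import pandas as pd

df = pd.DataFrame({"Average": ["123.4 ms"]}, index=["TTFT"])
results = [(df,), (df,)]
# The old helper returned one float from the first result only; the new
# one returns a per-result list:
print([float(r[0].loc["TTFT", "Average"][:-3]) for r in results])  # [123.4, 123.4]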