[Nightly][Refactor] Migrate nightly single-node model tests from .py to .yaml (#6503)
### What this PR does / why we need it?

This PR refactors the nightly single-node model tests by migrating test configurations from Python scripts to a more maintainable YAML-based format.

| Original PR | Python (`.py`) | YAML (`.yaml`) |
| :--- | :--- | :--- |
| [#3568](https://github.com/vllm-project/vllm-ascend/pull/3568) | `test_deepseek_r1_0528_w8a8_eplb.py` | `DeepSeek-R1-0528-W8A8.yaml` |
| [#3631](https://github.com/vllm-project/vllm-ascend/pull/3631) | `test_deepseek_r1_0528_w8a8.py` | `DeepSeek-R1-0528-W8A8.yaml` |
| [#5874](https://github.com/vllm-project/vllm-ascend/pull/5874) | `test_deepseek_r1_w8a8_hbm.py` | `DeepSeek-R1-W8A8-HBM.yaml` |
| [#3908](https://github.com/vllm-project/vllm-ascend/pull/3908) | `test_deepseek_v3_2_w8a8.py` | `DeepSeek-V3.2-W8A8.yaml` |
| [#5682](https://github.com/vllm-project/vllm-ascend/pull/5682) | `test_kimi_k2_thinking.py` | `Kimi-K2-Thinking.yaml` |
| [#4111](https://github.com/vllm-project/vllm-ascend/pull/4111) | `test_mtpx_deepseek_r1_0528_w8a8.py` | `MTPX-DeepSeek-R1-0528-W8A8.yaml` |
| [#3733](https://github.com/vllm-project/vllm-ascend/pull/3733) | `test_prefix_cache_deepseek_r1_0528_w8a8.py` | `Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml` |
| [#6543](https://github.com/vllm-project/vllm-ascend/pull/6543) | `test_qwen3_235b_w8a8.py` | `Qwen3-235B-A22B-W8A8.yaml` |
| [#6543](https://github.com/vllm-project/vllm-ascend/pull/6543) | `test_qwen3_235b_a22b_w8a8_eplb.py` | `Qwen3-235B-A22B-W8A8.yaml` |
| [#3973](https://github.com/vllm-project/vllm-ascend/pull/3973) | `test_qwen3_30b_w8a8.py` | `Qwen3-30B-A3B-W8A8.yaml` |
| [#3541](https://github.com/vllm-project/vllm-ascend/pull/3541) | `test_qwen3_32b_int8.py` | `Qwen3-32B-Int8.yaml` |
| [#3757](https://github.com/vllm-project/vllm-ascend/pull/3757) | `test_qwq_32b.py` | `QwQ-32B.yaml` |
| [#5616](https://github.com/vllm-project/vllm-ascend/pull/5616) | `test_qwen3_next_w8a8.py` | `Qwen3-Next-80B-A3B-Instruct-W8A8.yaml` |
| [#3541](https://github.com/vllm-project/vllm-ascend/pull/3541) | `test_qwen2_5_vl_7b.py` | `Qwen2.5-VL-7B-Instruct.yaml` |
| [#5301](https://github.com/vllm-project/vllm-ascend/pull/5301) | `test_qwen2_5_vl_7b_epd.py` | `Qwen2.5-VL-7B-Instruct-EPD.yaml` |
| [#3707](https://github.com/vllm-project/vllm-ascend/pull/3707) | `test_qwen2_5_vl_32b.py` | `Qwen2.5-VL-32B-Instruct.yaml` |
| [#3676](https://github.com/vllm-project/vllm-ascend/pull/3676) | `test_qwen3_32b_int8_a3_feature_stack3.py` | `Qwen3-32B-Int8-A3-Feature-Stack3.yaml` |
| [#3709](https://github.com/vllm-project/vllm-ascend/pull/3709) | `test_prefix_cache_qwen3_32b_int8.py` | `Prefix-Cache-Qwen3-32B-Int8.yaml` |
| [#5395](https://github.com/vllm-project/vllm-ascend/pull/5395) | `test_qwen3_next.py` | `Qwen3-Next-80B-A3B-Instruct-A2.yaml` |
| [#3474](https://github.com/vllm-project/vllm-ascend/pull/3474) | `test_qwen3_32b.py` | `Qwen3-32B.yaml` |
| [#3541](https://github.com/vllm-project/vllm-ascend/pull/3541) | `test_qwen3_32b_int8.py` | `Qwen3-32B-Int8-A2.yaml` |

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
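For reference, a migrated YAML case can be exercised outside CI the same way the workflow below drives it. This is a minimal sketch, assuming a prepared `/vllm-workspace/vllm-ascend` checkout with test dependencies installed; the `CONFIG_YAML_PATH` variable and the runner script path come from the workflow diff below, and the chosen config file is just an example:

```bash
# Minimal sketch of a local YAML-driven run (mirrors the
# "Run Pytest (YAML-driven)" CI step in the workflow diff below).
cd /vllm-workspace/vllm-ascend
export VLLM_USE_MODELSCOPE=True
export CONFIG_YAML_PATH=Qwen3-32B.yaml   # any file under models/configs/
pytest -sv tests/e2e/nightly/single_node/models/scripts/test_single_node.py
```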
.github/workflows/_e2e_nightly_single_node.yaml (36 changed lines)
@@ -28,7 +28,10 @@ on:
         type: string
         default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11"
       tests:
-        required: true
+        required: false
         type: string
+      config_file_path:
+        required: false
+        type: string
       name:
         required: false
@@ -44,12 +47,12 @@ defaults:
 # only cancel in-progress runs of the same workflow
 # and ignore the lint / 1 card / 4 cards test type
 concurrency:
-  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.tests }}
+  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.config_file_path || inputs.tests }}
   cancel-in-progress: true

 jobs:
   e2e-nightly:
-    name: ${{ inputs.tests }}
+    name: ${{ inputs.name || inputs.config_file_path || inputs.tests }}
     runs-on: ${{ inputs.runner }}
     timeout-minutes: 600
     container:
@@ -114,14 +117,33 @@ jobs:
           update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20
           update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20

-      - name: Run vllm-project/vllm-ascend test
+      - name: Validate Inputs
+        run: |
+          if [[ -z "${{ inputs.tests }}" && -z "${{ inputs.config_file_path }}" ]]; then
+            echo "Error: Either 'tests' or 'config_file_path' must be provided."
+            exit 1
+          fi
+
+      - name: Run Pytest (py-driven)
+        if: ${{ inputs.tests != '' }}
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
           VLLM_USE_MODELSCOPE: True
           VLLM_CI_RUNNER: ${{ inputs.runner }}
           BENCHMARK_HOME: /vllm-workspace/vllm-ascend/benchmark
         working-directory: /vllm-workspace/vllm-ascend
         run: |
           # ignore test_dispatch_ffn_combine until the test is fixed
-          pytest -sv ${{ inputs.tests }} \
+          echo "Running pytest with tests path: ${{ inputs.tests }}"
+          pytest -sv "${{ inputs.tests }}" \
             --ignore=tests/e2e/nightly/single_node/ops/singlecard_ops/test_fused_moe.py
+
+      - name: Run Pytest (YAML-driven)
+        if: ${{ always() && inputs.config_file_path != '' }}
+        env:
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+          VLLM_USE_MODELSCOPE: True
+          VLLM_CI_RUNNER: ${{ inputs.runner }}
+          CONFIG_YAML_PATH: ${{ inputs.config_file_path }}
+        working-directory: /vllm-workspace/vllm-ascend
+        run: |
+          echo "Running YAML-driven test with config: ${{ inputs.config_file_path }}"
+          pytest -sv tests/e2e/nightly/single_node/models/scripts/test_single_node.py
.github/workflows/schedule_nightly_test_a2.yaml (34 changed lines)
@@ -49,15 +49,6 @@ jobs:
       fail-fast: false
       matrix:
         test_config:
-          - name: qwen3-next
-            os: linux-aarch64-a2b3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_next.py
-          - name: qwen3-32b
-            os: linux-aarch64-a2b3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_32b.py
-          - name: qwen3-32b-in8-a2
-            os: linux-aarch64-a2b3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_32b_int8.py
           - name: test_custom_op
             os: linux-aarch64-a2b3-1
             tests: tests/e2e/nightly/single_node/ops/singlecard_ops
@@ -71,10 +62,33 @@
       name: ${{ matrix.test_config.name }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'

+  single-node-yaml-tests:
+    name: single-node
+    if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
+    strategy:
+      fail-fast: false
+      matrix:
+        test_config:
+          - name: qwen3-32b
+            os: linux-aarch64-a2b3-4
+            config_file_path: Qwen3-32B.yaml
+          - name: qwen3-next-80b-a3b-instruct
+            os: linux-aarch64-a2b3-4
+            config_file_path: Qwen3-Next-80B-A3B-Instruct-A2.yaml
+          - name: qwen3-32b-int8
+            os: linux-aarch64-a2b3-4
+            config_file_path: Qwen3-32B-Int8-A2.yaml
+    uses: ./.github/workflows/_e2e_nightly_single_node.yaml
+    with:
+      runner: ${{ matrix.test_config.os }}
+      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
+      config_file_path: ${{ matrix.test_config.config_file_path }}
+      name: ${{ matrix.test_config.name }}
+
   multi-node-tests:
     name: multi-node
     if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
-    needs: single-node-tests
+    needs: [single-node-tests, single-node-yaml-tests]
     strategy:
       fail-fast: false
       max-parallel: 1
.github/workflows/schedule_nightly_test_a3.yaml (120 changed lines)
@@ -109,65 +109,11 @@
   single-node-tests:
     name: single-node
     if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
-    needs: multi-node-tests
+    needs: [multi-node-tests]
     strategy:
       fail-fast: false
       matrix:
         test_config:
-          - name: qwen3-32b-in8-a3
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_32b_int8.py
-          - name: qwen3-32b-int8-a3-feature-stack3
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_32b_int8_a3_feature_stack3.py
-          - name: qwen3-235b-a22b-w8a8-eplb
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py
-          - name: deepseek-r1-w8a8-eplb
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
-          - name: deepseek-r1-w8a8-mtpx
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_mtpx_deepseek_r1_0528_w8a8.py
-          - name: qwen2-5-vl-7b
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b.py
-          - name: qwen2-5-vl-7b-epd
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b_epd.py
-          - name: qwen2-5-vl-32b
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen2_5_vl_32b.py
-          - name: qwen3-32b-int8-prefix-cache
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_prefix_cache_qwen3_32b_int8.py
-          - name: deepseek-r1-0528-w8a8
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
-          - name: deepseek-r1-0528-w8a8-prefix-cache
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_prefix_cache_deepseek_r1_0528_w8a8.py
-          - name: qwq-32b-a3
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwq_32b.py
-          - name: qwen3-30b-w8a8
-            os: linux-aarch64-a3-2
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_30b_w8a8.py
-          - name: qwen3-235b-w8a8
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py
-          - name: qwen3-next-w8a8
-            os: linux-aarch64-a3-4
-            tests: tests/e2e/nightly/single_node/models/test_qwen3_next_w8a8.py
-          - name: kimi-k2-thinking
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_kimi_k2_thinking.py
-          - name: deepseek-r1-w8a8-hbm
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hbm.py
-          - name: deepseek3_2-w8a8
-            os: linux-aarch64-a3-16
-            tests: tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py
           - name: qwen3-30b-acc
             os: linux-aarch64-a3-4
             tests: tests/e2e/weekly/single_node/models/test_qwen3_30b_acc.py
@@ -178,6 +124,70 @@
       tests: ${{ matrix.test_config.tests }}
       name: ${{ matrix.test_config.name }}

+  single-node-yaml-tests:
+    name: single-node
+    if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
+    needs: [multi-node-tests]
+    strategy:
+      fail-fast: false
+      matrix:
+        test_config:
+          # YAML-driven tests
+          - name: deepseek-r1-0528-w8a8
+            os: linux-aarch64-a3-16
+            config_file_path: DeepSeek-R1-0528-W8A8.yaml
+          - name: deepseek-r1-w8a8-hbm
+            os: linux-aarch64-a3-16
+            config_file_path: DeepSeek-R1-W8A8-HBM.yaml
+          - name: deepseek-v3-2-w8a8
+            os: linux-aarch64-a3-16
+            config_file_path: DeepSeek-V3.2-W8A8.yaml
+          - name: kimi-k2-thinking
+            os: linux-aarch64-a3-16
+            config_file_path: Kimi-K2-Thinking.yaml
+          - name: mtpx-deepseek-r1-0528-w8a8
+            os: linux-aarch64-a3-16
+            config_file_path: MTPX-DeepSeek-R1-0528-W8A8.yaml
+          - name: qwen3-235b-a22b-w8a8
+            os: linux-aarch64-a3-16
+            config_file_path: Qwen3-235B-A22B-W8A8.yaml
+          - name: qwen3-30b-a3b-w8a8
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen3-30B-A3B-W8A8.yaml
+          - name: qwen3-next-80b-a3b-instruct-w8a8
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen3-Next-80B-A3B-Instruct-W8A8.yaml
+          - name: qwq-32b
+            os: linux-aarch64-a3-4
+            config_file_path: QwQ-32B.yaml
+          - name: qwen3-32b-int8
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen3-32B-Int8.yaml
+          - name: qwen2-5-vl-7b
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen2.5-VL-7B-Instruct.yaml
+          - name: qwen2-5-vl-7b-epd
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen2.5-VL-7B-Instruct-EPD.yaml
+          - name: qwen2-5-vl-32b
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen2.5-VL-32B-Instruct.yaml
+          - name: qwen3-32b-int8-a3-feature-stack3
+            os: linux-aarch64-a3-4
+            config_file_path: Qwen3-32B-Int8-A3-Feature-Stack3.yaml
+          - name: qwen3-32b-int8-prefix-cache
+            os: linux-aarch64-a3-4
+            config_file_path: Prefix-Cache-Qwen3-32B-Int8.yaml
+          - name: deepseek-r1-0528-w8a8-prefix-cache
+            os: linux-aarch64-a3-16
+            config_file_path: Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml
+    uses: ./.github/workflows/_e2e_nightly_single_node.yaml
+    with:
+      runner: ${{ matrix.test_config.os }}
+      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
+      config_file_path: ${{ matrix.test_config.config_file_path }}
+      name: ${{ matrix.test_config.name }}
+
   custom-ops-tests:
     name: test ops
     if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
@@ -0,0 +1,94 @@
# ==========================================
# Shared Configurations
# ==========================================

_envs: &envs
  OMP_NUM_THREADS: "10"
  OMP_PROC_BIND: "false"
  HCCL_BUFFSIZE: "1024"
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  SERVER_PORT: "DEFAULT_PORT"

_server_cmd: &server_cmd
  - "--quantization"
  - "ascend"
  - "--data-parallel-size"
  - "2"
  - "--tensor-parallel-size"
  - "8"
  - "--enable-expert-parallel"
  - "--port"
  - "$SERVER_PORT"
  - "--seed"
  - "1024"
  - "--max-model-len"
  - "36864"
  - "--max-num-batched-tokens"
  - "4096"
  - "--max-num-seqs"
  - "16"
  - "--trust-remote-code"
  - "--gpu-memory-utilization"
  - "0.9"
  - "--speculative-config"
  - '{"num_speculative_tokens": 1, "method": "mtp"}'
  - "--additional-config"
  - '{"enable_weight_nz_layout": true}'

_benchmarks_acc: &benchmarks_acc
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 32768
    batch_size: 32
    baseline: 95
    threshold: 5

_benchmarks_perf: &benchmarks_perf
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs400
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 400
    max_out_len: 1500
    batch_size: 1000
    baseline: 1
    threshold: 0.97

# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "DeepSeek-R1-0528-W8A8-single"
    model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--enforce-eager"
    benchmarks:

  - name: "DeepSeek-R1-0528-W8A8-aclgraph"
    model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    benchmarks:
      <<: *benchmarks_acc
      <<: *benchmarks_perf

  - name: "DeepSeek-R1-0528-W8A8-EPLB"
    model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
    envs:
      <<: *envs
      DYNAMIC_EPLB: "true"
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--additional-config"
      - '{"enable_weight_nz_layout": true, "eplb_config": {"dynamic_eplb": "true", "expert_heat_collection_interval": 1000, "algorithm_execution_interval": 50, "eplb_policy_type": 3}}'
    benchmarks:
      <<: *benchmarks_acc
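The configuration above (and the ones that follow) shares settings across test cases with standard YAML anchors (`&`), aliases (`*`), and merge keys (`<<:`). A minimal, generic illustration of the pattern, with made-up names rather than one of the shipped configs:

```yaml
_envs: &envs              # anchor: define the shared mapping once
  HCCL_BUFFSIZE: "1024"

test_cases:
  - name: "case-a"
    envs:
      <<: *envs           # merge key: inherit every shared env var
      EXTRA_FLAG: "1"     # then extend or override per case
```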
@@ -0,0 +1,42 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "DeepSeek-R1-W8A8-HBM-single"
    model: "vllm-ascend/DeepSeek-R1-W8A8"
    envs:
      HCCL_BUFFSIZE: "1024"
      SERVER_PORT: "DEFAULT_PORT"
    server_cmd:
      - "--quantization"
      - "ascend"
      - "--port"
      - "$SERVER_PORT"
      - "--data-parallel-size"
      - "8"
      - "--data-parallel-size-local"
      - "8"
      - "--data-parallel-rpc-port"
      - "13389"
      - "--tensor-parallel-size"
      - "2"
      - "--enable-expert-parallel"
      - "--seed"
      - "1024"
      - "--max-num-seqs"
      - "32"
      - "--max-model-len"
      - "6000"
      - "--max-num-batched-tokens"
      - "6000"
      - "--trust-remote-code"
      - "--gpu-memory-utilization"
      - "0.92"
      - "--no-enable-prefix-caching"
      - "--reasoning-parser"
      - "deepseek_r1"
      - "--enforce-eager"
      - "--additional-config"
      - '{"ascend_scheduler_config": {"enabled": false}, "torchair_graph_config": {"enabled": false, "enable_multistream_shared_expert": false}}'
    benchmarks:
@@ -0,0 +1,78 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "DeepSeek-V3.2-W8A8-TP8-DP2"
    model: "vllm-ascend/DeepSeek-V3.2-W8A8"
    envs:
      HCCL_OP_EXPANSION_MODE: "AIV"
      OMP_PROC_BIND: "false"
      OMP_NUM_THREADS: "1"
      HCCL_BUFFSIZE: "1024"
      VLLM_ASCEND_ENABLE_MLAPO: "1"
      PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
      VLLM_ASCEND_ENABLE_FLASHCOMM1: "1"
      VLLM_ENGINE_READY_TIMEOUT_S: "1800"
      SERVER_PORT: "DEFAULT_PORT"
    server_cmd:
      - "--enable-expert-parallel"
      - "--tensor-parallel-size"
      - "8"
      - "--data-parallel-size"
      - "2"
      - "--port"
      - "$SERVER_PORT"
      - "--max-model-len"
      - "8192"
      - "--max-num-batched-tokens"
      - "8192"
      - "--max-num-seqs"
      - "4"
      - "--trust-remote-code"
      - "--quantization"
      - "ascend"
      - "--gpu-memory-utilization"
      - "0.98"
      - "--compilation-config"
      - '{"cudagraph_capture_sizes":[8, 16, 24, 32, 40, 48], "cudagraph_mode":"FULL_DECODE_ONLY"}'
      - "--speculative-config"
      - '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
      - "--additional-config"
      - '{"layer_sharding": ["q_b_proj", "o_proj"]}'
      - "--reasoning-parser"
      - "deepseek_v3"
      - "--tokenizer_mode"
      - "deepseek_v32"
    benchmarks:
      acc:
        case_type: accuracy
        dataset_path: vllm-ascend/gsm8k-lite
        request_conf: vllm_api_general_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
        max_out_len: 4096
        batch_size: 8
        baseline: 95
        threshold: 5
      perf_1:
        case_type: performance
        dataset_path: vllm-ascend/GSM8K-in3500-bs400
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 1
        max_out_len: 1500
        batch_size: 1
        request_rate: 11.2
        baseline: 134
        threshold: 0.97
      perf_2:
        case_type: performance
        dataset_path: vllm-ascend/GSM8K-in3500-bs400
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 100
        max_out_len: 1500
        batch_size: 4
        request_rate: 11.2
        baseline: 134
        threshold: 0.97
tests/e2e/nightly/single_node/models/configs/GLM-4.5.yaml (new file, 72 lines)
@@ -0,0 +1,72 @@
# ==========================================
# Shared Configurations
# ==========================================

_envs: &envs
  HCCL_BUFFSIZE: "1024"
  SERVER_PORT: "DEFAULT_PORT"

_server_cmd: &server_cmd
  - "--no-enable-prefix-caching"
  - "--enable-expert-parallel"
  - "--tensor-parallel-size"
  - "8"
  - "--data-parallel-size"
  - "2"
  - "--port"
  - "$SERVER_PORT"
  - "--max-model-len"
  - "8192"
  - "--max-num-batched-tokens"
  - "8192"
  - "--block-size"
  - "16"
  - "--trust-remote-code"
  - "--gpu-memory-utilization"
  - "0.9"

_benchmarks: &benchmarks
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 4096
    batch_size: 8
    baseline: 95
    threshold: 5
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs400
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 16
    max_out_len: 1500
    batch_size: 8
    request_rate: 0
    baseline: 1
    threshold: 0.97

# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "GLM-4.5-TP8-DP2-fullgraph"
    model: "ZhipuAI/GLM-4.5"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--compilation-config"
      - '{"cudagraph_capture_sizes": [1,2,4,8,16], "cudagraph_mode":"FULL_DECODE_ONLY"}'
    benchmarks:
      <<: *benchmarks

  - name: "GLM-4.5-TP8-DP2-eager"
    model: "ZhipuAI/GLM-4.5"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    benchmarks:
      <<: *benchmarks
@@ -0,0 +1,52 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Kimi-K2-Thinking-TP16-Case"
    model: "moonshotai/Kimi-K2-Thinking"
    envs:
      HCCL_BUFFSIZE: "1024"
      TASK_QUEUE_ENABLE: "1"
      OMP_PROC_BIND: "false"
      HCCL_OP_EXPANSION_MODE: "AIV"
      PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
      SERVER_PORT: "DEFAULT_PORT"
    server_cmd:
      - "--tensor-parallel-size"
      - "16"
      - "--port"
      - "$SERVER_PORT"
      - "--max-model-len"
      - "8192"
      - "--max-num-batched-tokens"
      - "8192"
      - "--max-num-seqs"
      - "12"
      - "--gpu-memory-utilization"
      - "0.9"
      - "--trust-remote-code"
      - "--enable-expert-parallel"
      - "--no-enable-prefix-caching"
    benchmarks:
      acc:
        case_type: accuracy
        dataset_path: vllm-ascend/gsm8k-lite
        request_conf: vllm_api_general_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
        max_out_len: 4096
        batch_size: 32
        baseline: 95
        threshold: 5
      perf:
        case_type: performance
        dataset_path: vllm-ascend/GSM8K-in3500-bs400
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 512
        max_out_len: 256
        batch_size: 64
        trust_remote_code: true
        request_rate: 11.2
        baseline: 1
        threshold: 0.97
@@ -0,0 +1,90 @@
# ==========================================
# Shared Configurations
# ==========================================

_envs: &envs
  OMP_NUM_THREADS: "100"
  OMP_PROC_BIND: "false"
  HCCL_BUFFSIZE: "1024"
  VLLM_RPC_TIMEOUT: "3600000"
  VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "3600000"
  SERVER_PORT: "DEFAULT_PORT"

_server_cmd: &server_cmd
  - "--quantization"
  - "ascend"
  - "--seed"
  - "1024"
  - "--no-enable-prefix-caching"
  - "--data-parallel-size"
  - "2"
  - "--tensor-parallel-size"
  - "8"
  - "--enable-expert-parallel"
  - "--port"
  - "$SERVER_PORT"
  - "--max-model-len"
  - "40960"
  - "--max-num-seqs"
  - "14"
  - "--trust-remote-code"

_benchmarks_gsm8k: &benchmarks_gsm8k
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 32768
    batch_size: 32
    baseline: 95
    threshold: 5

_benchmarks_aime: &benchmarks_aime
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/aime2024
    request_conf: vllm_api_general_chat
    dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
    max_out_len: 32768
    batch_size: 32
    baseline: 86.67
    threshold: 7

# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "MTPX-DeepSeek-R1-0528-W8A8-mtp2"
    model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--max-num-batched-tokens"
      - "4096"
      - "--speculative-config"
      - '{"num_speculative_tokens": 2, "method": "mtp"}'
      - "--gpu-memory-utilization"
      - "0.92"
    benchmarks:
      <<: *benchmarks_gsm8k

  - name: "MTPX-DeepSeek-R1-0528-W8A8-mtp3"
    model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
    envs:
      <<: *envs
      HCCL_OP_EXPANSION_MODE: "AIV"
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--max-num-batched-tokens"
      - "2048"
      - "--speculative-config"
      - '{"num_speculative_tokens": 3, "method": "mtp"}'
      - "--gpu-memory-utilization"
      - "0.9"
      - "--compilation-config"
      - '{"cudagraph_capture_sizes": [56], "cudagraph_mode": "FULL_DECODE_ONLY"}'
    benchmarks:
      <<: *benchmarks_aime
@@ -0,0 +1,77 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "prefix-cache-deepseek-r1-0528-w8a8"
    model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
    envs:
      OMP_NUM_THREADS: "10"
      OMP_PROC_BIND: "false"
      HCCL_BUFFSIZE: "1024"
      PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
      SERVER_PORT: "DEFAULT_PORT"
    server_cmd:
      - "--quantization"
      - "ascend"
      - "--data-parallel-size"
      - "2"
      - "--tensor-parallel-size"
      - "8"
      - "--enable-expert-parallel"
      - "--port"
      - "$SERVER_PORT"
      - "--seed"
      - "1024"
      - "--max-model-len"
      - "5200"
      - "--max-num-batched-tokens"
      - "4096"
      - "--max-num-seqs"
      - "16"
      - "--trust-remote-code"
      - "--gpu-memory-utilization"
      - "0.9"
      - "--additional-config"
      - '{"enable_weight_nz_layout": true}'
      - "--speculative-config"
      - '{"num_speculative_tokens": 1, "method": "mtp"}'
    test_content:
      - "benchmark_comparisons"
    benchmark_comparisons_args:
      - metric: "TTFT"
        baseline: "prefix0"
        target: "prefix75"
        ratio: 0.8
        operator: "<"
    benchmarks:
      warm_up:
        case_type: performance
        dataset_path: vllm-ascend/GSM8K-in1024-bs210
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
        max_out_len: 2
        batch_size: 1000
        baseline: 0
        threshold: 0.97
      prefix0:
        case_type: performance
        dataset_path: vllm-ascend/prefix0-in3500-bs210
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
        max_out_len: 1500
        batch_size: 18
        baseline: 1
        threshold: 0.97
      prefix75:
        case_type: performance
        dataset_path: vllm-ascend/prefix75-in3500-bs210
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
        max_out_len: 1500
        batch_size: 18
        baseline: 1
        threshold: 0.97
@@ -0,0 +1,70 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "prefix-cache-qwen3-32b-w8a8"
    model: "vllm-ascend/Qwen3-32B-W8A8"
    envs:
      TASK_QUEUE_ENABLE: "1"
      HCCL_OP_EXPANSION_MODE: "AIV"
      SERVER_PORT: "DEFAULT_PORT"
    server_cmd:
      - "--quantization"
      - "ascend"
      - "--reasoning-parser"
      - "qwen3"
      - "--tensor-parallel-size"
      - "4"
      - "--port"
      - "$SERVER_PORT"
      - "--max-model-len"
      - "8192"
      - "--max-num-batched-tokens"
      - "8192"
      - "--max-num-seqs"
      - "256"
      - "--trust-remote-code"
      - "--gpu-memory-utilization"
      - "0.9"
      - "--additional-config"
      - '{"enable_weight_nz_layout": true}'
    test_content:
      - "benchmark_comparisons"
    benchmark_comparisons_args:
      - metric: "TTFT"
        baseline: "prefix0"
        target: "prefix75"
        ratio: 0.8
        operator: "<"
    benchmarks:
      warm_up:
        case_type: performance
        dataset_path: vllm-ascend/GSM8K-in1024-bs210
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
        max_out_len: 2
        batch_size: 1000
        baseline: 0
        threshold: 0.97
      prefix0:
        case_type: performance
        dataset_path: vllm-ascend/prefix0-in3500-bs210
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
        max_out_len: 1500
        batch_size: 48
        baseline: 1
        threshold: 0.97
      prefix75:
        case_type: performance
        dataset_path: vllm-ascend/prefix75-in3500-bs210
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
        max_out_len: 1500
        batch_size: 48
        baseline: 1
        threshold: 0.97
tests/e2e/nightly/single_node/models/configs/QwQ-32B.yaml (new file, 78 lines)
@@ -0,0 +1,78 @@
# ==========================================
# Shared Configurations
# ==========================================

_envs: &envs
  TASK_QUEUE_ENABLE: "1"
  OMP_PROC_BIND: "false"
  HCCL_OP_EXPANSION_MODE: "AIV"
  VLLM_ASCEND_ENABLE_FLASHCOMM: "1"
  VLLM_ASCEND_ENABLE_DEBSE_OPTIMIZE: "1"
  SERVER_PORT: "DEFAULT_PORT"

_server_cmd: &server_cmd
  - "--tensor-parallel-size"
  - "4"
  - "--port"
  - "$SERVER_PORT"
  - "--max-model-len"
  - "36864"
  - "--max-num-batched-tokens"
  - "36864"
  - "--block-size"
  - "128"
  - "--trust-remote-code"
  - "--gpu-memory-utilization"
  - "0.9"
  - "--reasoning-parser"
  - "deepseek_r1"
  - "--distributed_executor_backend"
  - "mp"
  - "--additional-config"
  - '{"weight_prefetch_config":{"enabled":true}}'

_benchmarks: &benchmarks
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 32768
    batch_size: 32
    baseline: 95
    threshold: 5
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs400
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 240
    max_out_len: 1500
    batch_size: 60
    baseline: 1
    threshold: 0.97

# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "QwQ-32B-aclgraph"
    model: "Qwen/QwQ-32B"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--compilation_config"
      - '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
    benchmarks:
      <<: *benchmarks

  - name: "QwQ-32B-single"
    model: "Qwen/QwQ-32B"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--enforce-eager"
    benchmarks:
@@ -0,0 +1,63 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Qwen2.5-VL-32B-Instruct-a3"
    model: "Qwen/Qwen2.5-VL-32B-Instruct"
    envs:
      TASK_QUEUE_ENABLE: "1"
      VLLM_ASCEND_ENABLE_NZ: "0"
      HCCL_OP_EXPANSION_MODE: "AIV"
      SERVER_PORT: "DEFAULT_PORT"
    server_cmd:
      - "--no-enable-prefix-caching"
      - "--mm-processor-cache-gb"
      - "0"
      - "--tensor-parallel-size"
      - "4"
      - "--port"
      - "$SERVER_PORT"
      - "--max-model-len"
      - "30000"
      - "--max-num-batched-tokens"
      - "40000"
      - "--max-num-seqs"
      - "400"
      - "--trust-remote-code"
      - "--gpu-memory-utilization"
      - "0.8"
      - "--compilation_config"
      - '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    test_content:
      - "completion"
      - "image"
    benchmarks:
      acc:
        case_type: accuracy
        dataset_path: vllm-ascend/textvqa-lite
        request_conf: vllm_api_stream_chat
        dataset_conf: textvqa/textvqa_gen_base64
        max_out_len: 2048
        batch_size: 128
        baseline: 76.22
        temperature: 0
        top_k: -1
        top_p: 1
        repetition_penalty: 1
        threshold: 5
      perf:
        case_type: performance
        dataset_path: vllm-ascend/textvqa-perf-1080p
        request_conf: vllm_api_stream_chat
        dataset_conf: textvqa/textvqa_gen_base64
        num_prompts: 512
        max_out_len: 256
        batch_size: 128
        temperature: 0
        top_k: -1
        top_p: 1
        repetition_penalty: 1
        request_rate: 0
        baseline: 1
        threshold: 0.97
@@ -0,0 +1,92 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Qwen2.5-VL-7B-Instruct-epd"
    model: "Qwen/Qwen2.5-VL-7B-Instruct"
    service_mode: "epd"
    envs:
      ENCODE_PORT: "DEFAULT_PORT"
      PD_PORT: "DEFAULT_PORT"
      PROXY_PORT: "DEFAULT_PORT"
    epd_server_cmds:
      - - "--port"
        - "$ENCODE_PORT"
        - "--model"
        - "Qwen/Qwen2.5-VL-7B-Instruct"
        - "--gpu-memory-utilization"
        - "0.01"
        - "--tensor-parallel-size"
        - "1"
        - "--enforce-eager"
        - "--no-enable-prefix-caching"
        - "--max-model-len"
        - "10000"
        - "--max-num-batched-tokens"
        - "10000"
        - "--max-num-seqs"
        - "1"
        - "--ec-transfer-config"
        - '{"ec_connector_extra_config":{"shared_storage_path":"/dev/shm/epd/storage"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
      - - "--port"
        - "$PD_PORT"
        - "--model"
        - "Qwen/Qwen2.5-VL-7B-Instruct"
        - "--gpu-memory-utilization"
        - "0.95"
        - "--tensor-parallel-size"
        - "1"
        - "--enforce-eager"
        - "--max-model-len"
        - "10000"
        - "--max-num-batched-tokens"
        - "10000"
        - "--max-num-seqs"
        - "128"
        - "--ec-transfer-config"
        - '{"ec_connector_extra_config":{"shared_storage_path":"/dev/shm/epd/storage"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
    epd_proxy_args:
      - "--host"
      - "127.0.0.1"
      - "--port"
      - "$PROXY_PORT"
      - "--encode-servers-urls"
      - "http://localhost:$ENCODE_PORT"
      - "--decode-servers-urls"
      - "http://localhost:$PD_PORT"
      - "--prefill-servers-urls"
      - "disable"
    test_content:
    benchmarks:
      warm_up:
        case_type: performance
        dataset_path: vllm-ascend/textvqa-perf-1080p
        request_conf: vllm_api_stream_chat
        dataset_conf: textvqa/textvqa_gen_base64
        num_prompts: 50
        max_out_len: 20
        batch_size: 32
        request_rate: 0
        baseline: 1
        threshold: 0.97
      acc:
        case_type: accuracy
        dataset_path: vllm-ascend/textvqa-lite
        request_conf: vllm_api_stream_chat
        dataset_conf: textvqa/textvqa_gen_base64
        max_out_len: 2048
        batch_size: 128
        baseline: 82.05
        threshold: 5
      perf:
        case_type: performance
        dataset_path: vllm-ascend/textvqa-perf-1080p
        request_conf: vllm_api_stream_chat
        dataset_conf: textvqa/textvqa_gen_base64
        num_prompts: 512
        max_out_len: 256
        batch_size: 128
        request_rate: 0
        baseline: 1
        threshold: 0.97
@@ -0,0 +1,55 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Qwen2.5-VL-7B-Instruct"
    model: "Qwen/Qwen2.5-VL-7B-Instruct"
    envs:
      TASK_QUEUE_ENABLE: "1"
      VLLM_ASCEND_ENABLE_NZ: "0"
      HCCL_OP_EXPANSION_MODE: "AIV"
      SERVER_PORT: "DEFAULT_PORT"
    server_cmd:
      - "--no-enable-prefix-caching"
      - "--mm-processor-cache-gb"
      - "0"
      - "--tensor-parallel-size"
      - "4"
      - "--port"
      - "$SERVER_PORT"
      - "--max-model-len"
      - "30000"
      - "--max-num-batched-tokens"
      - "40000"
      - "--max-num-seqs"
      - "400"
      - "--trust-remote-code"
      - "--gpu-memory-utilization"
      - "0.8"
      - "--compilation_config"
      - '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    test_content:
      - "completion"
      - "image"
    benchmarks:
      acc:
        case_type: accuracy
        dataset_path: vllm-ascend/textvqa-lite
        request_conf: vllm_api_stream_chat
        dataset_conf: textvqa/textvqa_gen_base64
        max_out_len: 2048
        batch_size: 128
        baseline: 82.05
        threshold: 5
      perf:
        case_type: performance
        dataset_path: vllm-ascend/textvqa-perf-1080p
        request_conf: vllm_api_stream_chat
        dataset_conf: textvqa/textvqa_gen_base64
        num_prompts: 512
        max_out_len: 256
        batch_size: 128
        request_rate: 0
        baseline: 1
        threshold: 0.97
@@ -0,0 +1,85 @@
# ==========================================
# Shared Configurations
# ==========================================

_envs: &envs
  OMP_NUM_THREADS: "10"
  OMP_PROC_BIND: "false"
  HCCL_BUFFSIZE: "1024"
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  VLLM_ASCEND_ENABLE_FLASHCOMM1: "1"
  SERVER_PORT: "DEFAULT_PORT"

_server_cmd: &server_cmd
  - "--quantization"
  - "ascend"
  - "--async-scheduling"
  - "--data-parallel-size"
  - "4"
  - "--tensor-parallel-size"
  - "4"
  - "--enable-expert-parallel"
  - "--port"
  - "$SERVER_PORT"
  - "--max-model-len"
  - "40960"
  - "--max-num-batched-tokens"
  - "8192"
  - "--max-num-seqs"
  - "12"
  - "--trust-remote-code"
  - "--gpu-memory-utilization"
  - "0.9"

_benchmarks: &benchmarks
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 32768
    batch_size: 32
    top_k: 20
    baseline: 95
    threshold: 5

# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Qwen3-235B-A22B-W8A8-full_graph"
    model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--compilation-config"
      - '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    benchmarks:
      <<: *benchmarks

  - name: "Qwen3-235B-A22B-W8A8-piecewise"
    model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--compilation-config"
      - '{"cudagraph_mode": "PIECEWISE"}'
    benchmarks:
      <<: *benchmarks

  - name: "Qwen3-235B-A22B-W8A8-EPLB"
    model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
    envs:
      <<: *envs
      DYNAMIC_EPLB: "true"
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--additional-config"
      - '{"eplb_config": {"dynamic_eplb": "true", "expert_heat_collection_interval": 600, "algorithm_execution_interval": 50, "num_redundant_experts": 16, "eplb_policy_type": 2}}'
      - "--compilation-config"
      - '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    benchmarks:
      <<: *benchmarks
@@ -0,0 +1,46 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Qwen3-30B-A3B-W8A8-TP1"
    model: "vllm-ascend/Qwen3-30B-A3B-W8A8"
    envs:
      OMP_PROC_BIND: "false"
      OMP_NUM_THREADS: "10"
      HCCL_BUFFSIZE: "1024"
      HCCL_OP_EXPANSION_MODE: "AIV"
      PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
      SERVER_PORT: "DEFAULT_PORT"
    server_cmd:
      - "--quantization"
      - "ascend"
      - "--async-scheduling"
      - "--no-enable-prefix-caching"
      - "--tensor-parallel-size"
      - "1"
      - "--port"
      - "$SERVER_PORT"
      - "--max-model-len"
      - "5600"
      - "--max-num-batched-tokens"
      - "16384"
      - "--max-num-seqs"
      - "100"
      - "--trust-remote-code"
      - "--gpu-memory-utilization"
      - "0.9"
      - "--compilation-config"
      - '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    benchmarks:
      perf:
        case_type: performance
        dataset_path: vllm-ascend/GSM8K-in3500-bs400
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 180
        max_out_len: 1500
        batch_size: 45
        request_rate: 0
        baseline: 1
        threshold: 0.97
@@ -0,0 +1,79 @@
# ==========================================
# Shared Configurations
# ==========================================

_envs: &envs
  TASK_QUEUE_ENABLE: "1"
  HCCL_OP_EXPANSION_MODE: "AIV"
  VLLM_ASCEND_ENABLE_FLASHCOMM: "1"
  SERVER_PORT: "DEFAULT_PORT"

_server_cmd: &server_cmd
  - "--quantization"
  - "ascend"
  - "--no-enable-prefix-caching"
  - "--tensor-parallel-size"
  - "4"
  - "--port"
  - "$SERVER_PORT"
  - "--max-model-len"
  - "40960"
  - "--max-num-batched-tokens"
  - "40960"
  - "--block-size"
  - "128"
  - "--trust-remote-code"
  - "--reasoning-parser"
  - "qwen3"
  - "--gpu-memory-utilization"
  - "0.9"
  - "--async-scheduling"
  - "--additional-config"
  - '{"weight_prefetch_config":{"enabled":true}}'

_benchmarks: &benchmarks
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/aime2024
    request_conf: vllm_api_general_chat
    dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
    max_out_len: 32768
    batch_size: 32
    baseline: 83.33
    threshold: 7

  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs400
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 288
    max_out_len: 1500
    batch_size: 72
    baseline: 1
    threshold: 0.97

# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Qwen3-32B-W8A8-aclgraph-a2"
    model: "vllm-ascend/Qwen3-32B-W8A8"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--compilation-config"
      - '{"cudagraph_mode":"FULL_DECODE_ONLY","cudagraph_capture_sizes":[1,12,16,20,24,32,48,60,64,68,72,76,80]}'
    benchmarks:
      <<: *benchmarks

  - name: "Qwen3-32B-W8A8-single-a2"
    model: "vllm-ascend/Qwen3-32B-W8A8"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--enforce-eager"
    benchmarks:
@@ -0,0 +1,69 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Qwen3-32B-W8A8-a3-feature-stack3"
    model: "vllm-ascend/Qwen3-32B-W8A8"
    envs:
      VLLM_USE: "1"
      TASK_QUEUE_ENABLE: "1"
      HCCL_OP_EXPANSION_MODE: "AIV"
      OMP_PROC_BIND: "false"
      VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE: "1"
      VLLM_ASCEND_ENABLE_FLASHCOMM: "1"
      SERVER_PORT: "DEFAULT_PORT"
    prompts:
      - "9.11 and 9.8, which is greater?"
    api_keyword_args:
      chat_template_kwargs:
        enable_thinking: true
    server_cmd:
      - "--quantization"
      - "ascend"
      - "--tensor-parallel-size"
      - "4"
      - "--port"
      - "$SERVER_PORT"
      - "--trust-remote-code"
      - "--reasoning-parser"
      - "qwen3"
      - "--distributed_executor_backend"
      - "mp"
      - "--gpu-memory-utilization"
      - "0.9"
      - "--block-size"
      - "128"
      - "--max-num-seqs"
      - "256"
      - "--enforce-eager"
      - "--max-model-len"
      - "35840"
      - "--max-num-batched-tokens"
      - "35840"
      - "--additional-config"
      - '{"enable_weight_nz_layout":true, "weight_prefetch_config":{"enabled": true}}'
      - "--compilation-config"
      - '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
    test_content:
      - "chat_completion"
    benchmarks:
      acc:
        case_type: accuracy
        dataset_path: vllm-ascend/gsm8k-lite
        request_conf: vllm_api_general_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_noncot_chat_prompt
        max_out_len: 10240
        batch_size: 32
        baseline: 96
        threshold: 4
      perf:
        case_type: performance
        dataset_path: vllm-ascend/GSM8K-in3500-bs400
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 240
        max_out_len: 1500
        batch_size: 60
        baseline: 1
        threshold: 0.97
@@ -0,0 +1,78 @@
# ==========================================
# Shared Configurations
# ==========================================

_envs: &envs
  TASK_QUEUE_ENABLE: "1"
  HCCL_OP_EXPANSION_MODE: "AIV"
  VLLM_ASCEND_ENABLE_FLASHCOMM: "1"
  SERVER_PORT: "DEFAULT_PORT"

_server_cmd: &server_cmd
  - "--quantization"
  - "ascend"
  - "--no-enable-prefix-caching"
  - "--tensor-parallel-size"
  - "4"
  - "--port"
  - "$SERVER_PORT"
  - "--max-model-len"
  - "40960"
  - "--max-num-batched-tokens"
  - "40960"
  - "--block-size"
  - "128"
  - "--trust-remote-code"
  - "--reasoning-parser"
  - "qwen3"
  - "--gpu-memory-utilization"
  - "0.9"
  - "--async-scheduling"
  - "--additional-config"
  - '{"weight_prefetch_config":{"enabled":true}}'

_benchmarks: &benchmarks
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/aime2024
    request_conf: vllm_api_general_chat
    dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
    max_out_len: 32768
    batch_size: 32
    baseline: 83.33
    threshold: 7
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs400
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 304
    max_out_len: 1500
    batch_size: 76
    baseline: 1
    threshold: 0.97

# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Qwen3-32B-W8A8-aclgraph-a3"
    model: "vllm-ascend/Qwen3-32B-W8A8"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--compilation-config"
      - '{"cudagraph_mode":"FULL_DECODE_ONLY","cudagraph_capture_sizes":[1,12,16,20,24,32,48,60,64,68,72,76,80]}'
    benchmarks:
      <<: *benchmarks

  - name: "Qwen3-32B-W8A8-single-a3"
    model: "vllm-ascend/Qwen3-32B-W8A8"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--enforce-eager"
    benchmarks:
tests/e2e/nightly/single_node/models/configs/Qwen3-32B.yaml (new file, 51 lines)
@@ -0,0 +1,51 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Qwen3-32B-TP4"
    model: "Qwen/Qwen3-32B"
    envs:
      TASK_QUEUE_ENABLE: "1"
      OMP_PROC_BIND: "false"
      HCCL_OP_EXPANSION_MODE: "AIV"
      PAGED_ATTENTION_MASK_LEN: "5500"
      SERVER_PORT: "DEFAULT_PORT"
    server_cmd:
      - "--no-enable-prefix-caching"
      - "--tensor-parallel-size"
      - "4"
      - "--port"
      - "$SERVER_PORT"
      - "--max-model-len"
      - "36864"
      - "--max-num-batched-tokens"
      - "36864"
      - "--block-size"
      - "128"
      - "--trust-remote-code"
      - "--gpu-memory-utilization"
      - "0.9"
      - "--additional-config"
      - '{"enable_weight_nz_layout":true}'
    benchmarks:
      acc:
        case_type: accuracy
        dataset_path: vllm-ascend/gsm8k-lite
        request_conf: vllm_api_general_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
        max_out_len: 32768
        batch_size: 32
        baseline: 95
        threshold: 5
      perf:
        case_type: performance
        dataset_path: vllm-ascend/GSM8K-in3500-bs400
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 80
        max_out_len: 1500
        batch_size: 20
        request_rate: 0
        baseline: 1
        threshold: 0.97
@@ -0,0 +1,75 @@
# ==========================================
# Shared Configurations
# ==========================================

_envs: &envs
  OMP_NUM_THREADS: "10"
  OMP_PROC_BIND: "false"
  HCCL_BUFFSIZE: "1024"
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  SERVER_PORT: "DEFAULT_PORT"

_server_cmd: &server_cmd
  - "--tensor-parallel-size"
  - "4"
  - "--port"
  - "$SERVER_PORT"
  - "--max-model-len"
  - "40960"
  - "--trust-remote-code"
  - "--async-scheduling"
  - "--no-enable-prefix-caching"
  - "--enable-expert-parallel"
  - "--gpu-memory-utilization"
  - "0.8"
  - "--max-num-seqs"
  - "64"

_benchmarks: &benchmarks
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs400
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 256
    max_out_len: 1500
    batch_size: 64
    baseline: 1
    threshold: 0.97
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 32768
    batch_size: 32
    top_k: 20
    baseline: 95
    threshold: 5

# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Qwen3-Next-80B-A3B-Instruct-aclgraph-8192-a2"
    model: "Qwen/Qwen3-Next-80B-A3B-Instruct"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--max-num-batched-tokens"
      - "8192"
    benchmarks:
      <<: *benchmarks

  - name: "Qwen3-Next-80B-A3B-Instruct-aclgraph-32768-a2"
    model: "Qwen/Qwen3-Next-80B-A3B-Instruct"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--max-num-batched-tokens"
      - "32768"
    benchmarks:
      <<: *benchmarks
@@ -0,0 +1,45 @@
# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Qwen3-Next-80B-A3B-Instruct-W8A8"
    model: "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8"
    envs:
      OMP_NUM_THREADS: "10"
      OMP_PROC_BIND: "false"
      HCCL_BUFFSIZE: "1024"
      SERVER_PORT: "DEFAULT_PORT"
    server_cmd:
      - "--quantization"
      - "ascend"
      - "--async-scheduling"
      - "--no-enable-prefix-caching"
      - "--data-parallel-size"
      - "1"
      - "--tensor-parallel-size"
      - "4"
      - "--enable-expert-parallel"
      - "--port"
      - "$SERVER_PORT"
      - "--max-model-len"
      - "40960"
      - "--max-num-batched-tokens"
      - "8192"
      - "--max-num-seqs"
      - "32"
      - "--trust-remote-code"
      - "--gpu-memory-utilization"
      - "0.65"
      - "--compilation-config"
      - '{"cudagraph_capture_sizes": [32]}'
    benchmarks:
      acc:
        case_type: accuracy
        dataset_path: vllm-ascend/gsm8k-lite
        request_conf: vllm_api_general_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
        max_out_len: 32768
        batch_size: 32
        baseline: 95
        threshold: 5
@@ -0,0 +1,75 @@
# ==========================================
# Shared Configurations
# ==========================================

_envs: &envs
  OMP_NUM_THREADS: "10"
  OMP_PROC_BIND: "false"
  HCCL_BUFFSIZE: "1024"
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  SERVER_PORT: "DEFAULT_PORT"

_server_cmd: &server_cmd
  - "--tensor-parallel-size"
  - "4"
  - "--port"
  - "$SERVER_PORT"
  - "--max-model-len"
  - "40960"
  - "--trust-remote-code"
  - "--async-scheduling"
  - "--no-enable-prefix-caching"
  - "--enable-expert-parallel"
  - "--gpu-memory-utilization"
  - "0.8"
  - "--max-num-seqs"
  - "64"

_benchmarks: &benchmarks
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs400
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 256
    max_out_len: 1500
    batch_size: 64
    baseline: 1
    threshold: 0.97
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 32768
    batch_size: 32
    top_k: 20
    baseline: 95
    threshold: 5

# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "Qwen3-Next-80B-A3B-Instruct-aclgraph-8192-a3"
    model: "Qwen/Qwen3-Next-80B-A3B-Instruct"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--max-num-batched-tokens"
      - "8192"
    benchmarks:
      <<: *benchmarks

  - name: "Qwen3-Next-80B-A3B-Instruct-aclgraph-32768-a3"
    model: "Qwen/Qwen3-Next-80B-A3B-Instruct"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--max-num-batched-tokens"
      - "32768"
    benchmarks:
      <<: *benchmarks
@@ -0,0 +1,312 @@
# vLLM-Ascend Single-Node E2E Test Developer Guide

This document helps developers understand the architecture of the single-node E2E (end-to-end) testing framework in `vllm-ascend`, how to run the test scripts, and how to add custom test functionality by writing YAML configuration files and extending the code.

## 1. Test Architecture Overview

To achieve high readability, extensibility, and decoupling of configuration from code, the single-node E2E test adopts a **"YAML-driven + Dispatcher"** architecture.

It consists of the following core components:

* **Configuration Parser (`single_node_config.py`)**: Reads `models/configs/*.yaml` files and parses them into a strongly typed `@dataclass` (`SingleNodeConfig`) via `SingleNodeConfigLoader`, while handling regex-based substitution of environment-variable placeholders.
* **Service Manager Framework (`test_single_node.py` and `conftest.py`)**: Based on the `service_mode` (`openai` or `epd`), uses context managers to safely start and stop server processes.
* **Test Function Dispatcher (`TEST_HANDLERS` registry)**: Specific test logic is encapsulated in independent functions and registered in the global `TEST_HANDLERS` dictionary.
* **Performance Benchmarking (`_run_benchmarks`)**: Calls `aisbench` for performance and TTFT testing based on the `benchmarks` parameters in the YAML.

### 1.1 Key Files and Responsibilities

* `tests/e2e/nightly/single_node/models/scripts/single_node_config.py`
  * Defines `SingleNodeConfig` and `SingleNodeConfigLoader`
  * Loads YAML from `tests/e2e/nightly/single_node/models/configs/<CONFIG_YAML_PATH>`
  * Auto-assigns ports when `envs` contains `DEFAULT_PORT` / missing values
  * Expands `$VAR` / `${VAR}` placeholders inside commands via `_expand_values` (see the sketch after this list)

* `tests/e2e/nightly/single_node/models/scripts/test_single_node.py`
  * Declares `configs = SingleNodeConfigLoader.from_yaml_cases()` (loaded at import time)
  * `pytest.mark.parametrize("config", configs, ids=[config.name for config in configs])` runs one test per YAML case
  * Controls the server lifecycle via context managers
  * Dispatches `test_content` to functions registered in `TEST_HANDLERS`
  * Runs `aisbench` and optional benchmark assertions
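The placeholder expansion behaves roughly like this — a minimal, standalone sketch of `_expand_values`, mirroring the implementation added later in this PR; the port value is illustrative (in practice it is auto-assigned when `SERVER_PORT` is `DEFAULT_PORT`):

```python
import re

def expand_values(values: list[str], envs: dict[str, str]) -> list[str]:
    # Replace $VAR / ${VAR} with the value from envs; unknown names are left as-is.
    pattern = re.compile(r"\$(\w+)|\$\{(\w+)\}")

    def repl(m: re.Match) -> str:
        key = m.group(1) or m.group(2)
        return str(envs.get(key, m.group(0)))

    return [pattern.sub(repl, str(arg)) for arg in values]

print(expand_values(["--port", "$SERVER_PORT"], {"SERVER_PORT": "8000"}))
# -> ['--port', '8000']
```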
### 1.2 End-to-End Flow (High Level)

```txt
pytest starts
  |
  v
import tests/e2e/nightly/single_node/models/scripts/test_single_node.py
  |
  v
configs = SingleNodeConfigLoader.from_yaml_cases()
  |
  v
pytest parametrize("config", configs)   # one config == one test case
  |
  v
test_single_node(config)
  |
+-----------------------------------------------+
| Start service (depends on service_mode)       |
|                                               |
| openai: start one vLLM OpenAI-compatible      |
|         service process                       |
| epd:    start (encode service + decode/PD     |
|         service) + start proxy process        |
+-----------------------------------------------+
  |
  v
Run test phases (test_content)
  |
  v
Optional benchmarks (if benchmarks is configured)
  |
  v
Shutdown all started processes

Notes:
- One YAML file may contain multiple test_cases; pytest will run them one by one.
- The framework is "YAML-driven": changes are typically made by editing YAML rather than editing Python code.
```

### 1.3 Function Call Relationships (Dispatcher)

`test_content` is a list of "phases". Each phase maps to one handler function (a minimal code sketch follows the diagram).

```txt
For each test_case:

test_content (list of phases)
  |
  v
[Dispatcher]
  |
  +--> phase "completion"       -> send completion request(s)
  |
  +--> phase "chat_completion"  -> send chat completion request(s)
  |
  +--> phase "image"            -> send multimodal image request(s)
  |
  \--> (extendable) add your own phase by registering a new handler

After phases:
  if benchmarks is configured -> run aisbench

Notes:
- The dispatcher only controls "what to run"; the service lifecycle is controlled by the service manager.
- Phases are intentionally small & composable so you can reuse them across YAML cases.
```
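In code, the dispatch is a plain dictionary lookup. A self-contained sketch of the pattern (the handler here is a stub standing in for the real ones in `test_single_node.py`, which also adds logging and the `benchmark_comparisons` special case):

```python
import asyncio

# Stub handler standing in for run_completion_test etc.
async def run_completion_test(config, server):
    print(f"completion phase for {config['name']}")

# Registry: phase name -> async handler; extend by adding an entry.
TEST_HANDLERS = {
    "completion": run_completion_test,
}

async def dispatch(config, server):
    for test_name in config["test_content"]:
        handler = TEST_HANDLERS.get(test_name)
        if handler:
            await handler(config, server)

asyncio.run(dispatch({"name": "demo", "test_content": ["completion"]}, server=None))
```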
## 2. Running and Debugging Steps

### 2.1 Dependencies

Ensure you are in an NPU environment and have installed `pytest`, `pyyaml`, `openai`, and `aisbench`.

### 2.2 Local Execution

The framework uses the `CONFIG_YAML_PATH` environment variable to specify the configuration file.

```bash
# Switch to the project root directory
cd /vllm-workspace/vllm-ascend

# Run a specific yaml test
export CONFIG_YAML_PATH="Qwen3-32B.yaml"
pytest -sv tests/e2e/nightly/single_node/models/scripts/test_single_node.py
```

### 2.3 Tips for Debugging

* Only run a subset of cases: `pytest -sv ... -k <keyword>` (matches case names in the report output; see the combined example below)
* Stop on the first failure: `pytest -sv ... -x`
* Keep server logs visible: use `-s` (already included in `-sv`) and increase log verbosity via standard Python logging configuration if needed.
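For example, combining these flags (the config file and keyword are illustrative; `-k` matches against the `name` values from the YAML):

```bash
# Run only cases whose name contains "8192", stopping at the first failure
export CONFIG_YAML_PATH="Qwen3-Next-80B-A3B-Instruct-A2.yaml"
pytest -sv tests/e2e/nightly/single_node/models/scripts/test_single_node.py \
    -k "8192" -x
```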
## 3. How to Write YAML Configuration Files

### 3.1 File Location and Selection Rules

* YAML files live under: `tests/e2e/nightly/single_node/models/configs/`
* Selected by env var: `CONFIG_YAML_PATH=<YourConfig>.yaml`
* If not set, the loader uses `SingleNodeConfigLoader.DEFAULT_CONFIG_NAME`

### 3.2 Field Descriptions

| Field Name | Type | Required | Default Value | Description |
| :--------------- | :--------- | :------- | :-------------- | :------------------------------------------------------------------ |
| `test_cases` | list | **Yes** | - | List of test case objects |
| `name` | string | **Yes** | - | Human-readable case ID shown in pytest output and logs |
| `model` | string | **Yes** | - | Model name or local path |
| `service_mode` | string | No | `openai` | Service mode: `openai` or `epd` (disaggregated) |
| `envs` | map | **Yes** | `{}` | Environment variables for the server process |
| `server_cmd` | list | Cond. | `[]` | vLLM startup arguments (required for non-EPD) |
| `server_cmd_extra` | list | No | `[]` | Extra vLLM startup arguments appended after `server_cmd` |
| `prompts` | list | No | built-in default | Prompts for completion/chat tests |
| `api_keyword_args` | map | No | built-in default | OpenAI API keyword args (e.g., `max_tokens`, sampling params) |
| `test_content` | list | No | `["completion"]` | Test phases: `completion`, `chat_completion`, `image`, etc. |
| `benchmarks` | map | No | `{}` | Configuration for `aisbench` performance verification |
| `epd_server_cmds`| list[list] | Cond. | `[]` | (EPD only) Command arrays for starting the Encode and Decode/PD processes |
| `epd_proxy_args` | list | Cond. | `[]` | (EPD only) Startup arguments for the EPD routing gateway |

**Notes / Behaviors**

* `name` is mandatory and must be a non-empty string.
  * It is used directly as the pytest case id (e.g., `test_single_node[DeepSeek-R1-0528-W8A8-single]`).
  * It is also printed in the `[single-node][START]` marker for log navigation.

* `envs` (ports): the config object recognizes these keys: `SERVER_PORT`, `ENCODE_PORT`, `PD_PORT`, `PROXY_PORT`.
  * If a port key is missing or set to `DEFAULT_PORT`, it will be automatically filled with an available open port.
  * `$SERVER_PORT` / `${SERVER_PORT}` placeholders in commands will be expanded using `envs`.

* `server_cmd` vs `server_cmd_extra`:
  * YAML can define `server_cmd_extra` to append additional args after `server_cmd`.
  * The loader merges them into a single `server_cmd` list (see the sketch below).

* Extra fields:
  * Any non-standard fields in a case are stored in `config.extra_config`.
  * This is how extension configs are passed through without changing the dataclass.
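The merge and pass-through behavior described above reduces to a few lines — a sketch mirroring `_parse_test_cases` in the loader added by this PR (`STANDARD_CASE_FIELDS` is abridged here):

```python
STANDARD_CASE_FIELDS = {"name", "model", "envs", "server_cmd", "server_cmd_extra"}  # abridged

case = {
    "name": "demo",
    "model": "<model>",
    "envs": {},
    "server_cmd": ["--port", "$SERVER_PORT"],
    "server_cmd_extra": ["--enforce-eager"],
    "benchmark_comparisons_args": [],  # non-standard -> lands in extra_config
}

full_cmd = list(case.get("server_cmd", [])) + list(case.get("server_cmd_extra", []))
extra_config = {k: v for k, v in case.items() if k not in STANDARD_CASE_FIELDS}
print(full_cmd)      # ['--port', '$SERVER_PORT', '--enforce-eager']
print(extra_config)  # {'benchmark_comparisons_args': []}
```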
### 3.3 YAML Examples

#### Single-Case (similar to DeepSeek-R1-W8A8-HBM)

```yaml
test_cases:
  - name: "<your-case-name>"
    model: "<model-repo-or-local-path>"

    # Optional: the default values are as follows
    prompts:
      - "San Francisco is a"
    api_keyword_args:
      max_tokens: 10

    envs:
      SERVER_PORT: "DEFAULT_PORT"
      # Add only what you need.

    server_cmd:
      - "--port"
      - "$SERVER_PORT"
      # plus your vLLM serve args...

    # Optional: omit -> defaults to ["completion"]
    test_content:
      - "chat_completion"

    # Optional: leave empty if you don't run aisbench
    benchmarks:
```

#### Multi-Case + Shared Anchors

```yaml
_envs: &envs
  SERVER_PORT: "DEFAULT_PORT"
  # shared envs...

_server_cmd: &server_cmd
  - "--port"
  - "$SERVER_PORT"
  # shared vLLM serve args...

_benchmarks: &benchmarks
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs400
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 400
    max_out_len: 1500
    batch_size: 1000
    baseline: 1
    threshold: 0.97

test_cases:
  - name: "case-a"
    model: "<model>"
    envs:
      <<: *envs
      DYNAMIC_EPLB: "true"
      # private envs...
    server_cmd: *server_cmd
    server_cmd_extra:
      - "--enforce-eager"
    benchmarks:

  - name: "case-b"
    model: "<model>"
    envs:
      <<: *envs
    server_cmd: *server_cmd
    benchmarks:
      <<: *benchmarks
```

(Note: `case-b` merges the `&benchmarks` anchor defined above; referencing an undefined anchor such as `*benchmarks_acc` would be a YAML error.)
#### EPD / Disaggregated Case

```yaml
test_cases:
  - name: "<your-epd-case>"
    model: "<model>"
    service_mode: "epd"
    envs:
      ENCODE_PORT: "DEFAULT_PORT"
      PD_PORT: "DEFAULT_PORT"
      PROXY_PORT: "DEFAULT_PORT"

    epd_server_cmds:
      - ["--port", "$ENCODE_PORT", "--model", "<encode-model>"]
      - ["--port", "$PD_PORT", "--model", "<decode-model>"]

    epd_proxy_args:
      - "--host"
      - "127.0.0.1"
      - "--port"
      - "$PROXY_PORT"
      - "--encode-servers-urls"
      - "http://localhost:$ENCODE_PORT"
      - "--decode-servers-urls"
      - "http://localhost:$PD_PORT"
      - "--prefill-servers-urls"
      - "disable"

    test_content:
      - "chat_completion"
```
## 4. How to Add Custom Tests (Extension)

### Step 1: Write your test logic in `test_single_node.py`

```python
async def run_video_test(config: SingleNodeConfig, server: 'RemoteOpenAIServer | DisaggEpdProxy') -> None:
    client = server.get_async_client()
    # Your custom logic here...
```
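A fleshed-out handler might look like the sketch below. Everything specific in it is an assumption for illustration: `video_url` is a hypothetical extra YAML field (non-standard fields surface through `extra_config`, see section 3.2), and the `video_url` chat content type is assumed to be supported by the serving stack under test.

```python
async def run_video_test(config: SingleNodeConfig, server: 'RemoteOpenAIServer | DisaggEpdProxy') -> None:
    client = server.get_async_client()
    # Hypothetical extra YAML field; non-standard fields land in extra_config.
    video_url = config.extra_config.get("video_url", "https://example.com/demo.mp4")
    resp = await client.chat.completions.create(
        model=config.model,
        messages=[{
            "role": "user",
            "content": [
                {"type": "video_url", "video_url": {"url": video_url}},
                {"type": "text", "text": "Describe this video."},
            ],
        }],
        **config.api_keyword_args,
    )
    assert resp.choices[0].message.content, "empty response"
```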
### Step 2: Register your function in `TEST_HANDLERS`

```python
TEST_HANDLERS = {
    "completion": run_completion_test,
    "video": run_video_test,  # Registered!
}
```

### Step 3: Enable in YAML

```yaml
test_content:
  - "completion"
  - "video"
```

## 5. Checklist (Before Submitting a New YAML)

* `test_cases` exists and is a list
* Each case contains the required fields for its `service_mode`
  * Common required: `name`, `model`, `envs`
  * `openai`: `server_cmd`
  * `epd`: `epd_server_cmds`, `epd_proxy_args`
* Port envs are set to `DEFAULT_PORT` (or to explicit free ports)
* If using `benchmarks`, ensure each benchmark case includes the required aisbench fields (e.g., `case_type`, `dataset_path`, `request_conf`, `dataset_conf`, `max_out_len`, `batch_size`)
16
tests/e2e/nightly/single_node/models/scripts/__init__.py
Normal file
@@ -0,0 +1,16 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
@@ -0,0 +1,183 @@
import logging
import os
import re
from dataclasses import dataclass, field
from typing import Any

import yaml
from vllm.utils.network_utils import get_open_port

CONFIG_BASE_PATH = "tests/e2e/nightly/single_node/models/configs"

logger = logging.getLogger(__name__)

# Default prompts and API args fallback
PROMPTS = [
    "San Francisco is a",
]

API_KEYWORD_ARGS = {
    "max_tokens": 10,
}


@dataclass
class SingleNodeConfig:
    name: str
    model: str
    envs: dict[str, Any] = field(default_factory=dict)
    prompts: list[str] = field(default_factory=lambda: PROMPTS)
    api_keyword_args: dict[str, Any] = field(default_factory=lambda: API_KEYWORD_ARGS)
    benchmarks: dict[str, Any] = field(default_factory=dict)
    server_cmd: list[str] = field(default_factory=list)
    test_content: list[str] = field(default_factory=lambda: ["completion"])
    service_mode: str = "openai"
    epd_server_cmds: list[list[str]] = field(default_factory=list)
    epd_proxy_args: list[str] = field(default_factory=list)
    extra_config: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        port_keys = ["SERVER_PORT", "ENCODE_PORT", "PD_PORT", "PROXY_PORT"]
        for env_key in port_keys:
            if self.envs.get(env_key) in ["DEFAULT_PORT", None]:
                self.envs[env_key] = str(get_open_port())

        if self.prompts is None:
            self.prompts = PROMPTS
        if self.api_keyword_args is None:
            self.api_keyword_args = API_KEYWORD_ARGS
        if self.benchmarks is None:
            self.benchmarks = {}
        if self.test_content is None:
            self.test_content = []

        self.server_cmd = self._expand_values(self.server_cmd or [], self.envs)
        self.epd_server_cmds = [self._expand_values(cmd, self.envs) for cmd in self.epd_server_cmds]
        self.epd_proxy_args = self._expand_values(self.epd_proxy_args or [], self.envs)

        for key, value in self.extra_config.items():
            setattr(self, key, value)

    @staticmethod
    def _expand_values(values: list[str], envs: dict[str, Any]) -> list[str]:
        """Interpolate $VAR/${VAR} placeholders with provided env values."""
        pattern = re.compile(r"\$(\w+)|\$\{(\w+)\}")

        def repl(m: re.Match[str]) -> str:
            key = m.group(1) or m.group(2)
            return str(envs.get(key, m.group(0)))

        return [pattern.sub(repl, str(arg)) for arg in values]

    def _get_required_port(self, key: str) -> int:
        value = self.envs.get(key)
        if value is None:
            raise ValueError(f"Missing required port env: {key}")
        return int(value)

    @property
    def server_port(self) -> int:
        return self._get_required_port("SERVER_PORT")

    @property
    def encode_port(self) -> int:
        return self._get_required_port("ENCODE_PORT")

    @property
    def pd_port(self) -> int:
        return self._get_required_port("PD_PORT")

    @property
    def proxy_port(self) -> int:
        return self._get_required_port("PROXY_PORT")


class SingleNodeConfigLoader:
    """Load SingleNodeConfig from yaml file."""

    DEFAULT_CONFIG_NAME = "Kimi-K2-Thinking.yaml"
    STANDARD_CASE_FIELDS = {
        "name",
        "model",
        "envs",
        "prompts",
        "api_keyword_args",
        "benchmarks",
        "service_mode",
        "server_cmd",
        "server_cmd_extra",
        "test_content",
        "epd_server_cmds",
        "epd_proxy_args",
    }

    @classmethod
    def from_yaml_cases(cls, yaml_path: str | None = None) -> list[SingleNodeConfig]:
        config = cls._load_yaml(yaml_path)

        if "test_cases" not in config:
            raise KeyError("test_cases field is required in config yaml")

        cases = config.get("test_cases")
        if not isinstance(cases, list):
            raise TypeError("test_cases must be a list")
        cls._validate_para(cases)

        return cls._parse_test_cases(cases)

    @classmethod
    def _load_yaml(cls, yaml_path: str | None) -> dict[str, Any]:
        if not yaml_path:
            yaml_path = os.getenv("CONFIG_YAML_PATH", cls.DEFAULT_CONFIG_NAME)

        full_path = os.path.join(CONFIG_BASE_PATH, yaml_path)
        logger.info("Loading config yaml: %s", full_path)

        with open(full_path) as f:
            return yaml.safe_load(f)

    @staticmethod
    def _validate_para(cases: list[dict[str, Any]]) -> None:
        if not cases:
            raise ValueError("test_cases is empty")
        for case in cases:
            mode = case.get("service_mode", "openai")
            required = ["name", "model", "envs"]
            if mode == "epd":
                required.extend(["epd_server_cmds", "epd_proxy_args"])
            else:
                required.append("server_cmd")
            missing = [k for k in required if k not in case]
            if missing:
                raise KeyError(f"Missing required config fields: {missing}")

            if not isinstance(case["name"], str) or not case["name"].strip():
                raise ValueError("test case field 'name' must be a non-empty string")

    @classmethod
    def _parse_test_cases(cls, cases: list[dict[str, Any]]) -> list[SingleNodeConfig]:
        result: list[SingleNodeConfig] = []
        for case in cases:
            server_cmd = case.get("server_cmd", [])
            server_cmd_extra = case.get("server_cmd_extra", [])
            full_cmd = list(server_cmd) + list(server_cmd_extra)
            extra_case_fields = {key: value for key, value in case.items() if key not in cls.STANDARD_CASE_FIELDS}

            # Safe parsing mapping
            result.append(
                SingleNodeConfig(
                    name=case["name"],
                    model=case["model"],
                    envs=case.get("envs", {}),
                    server_cmd=full_cmd,
                    epd_server_cmds=case.get("epd_server_cmds", []),
                    epd_proxy_args=case.get("epd_proxy_args", []),
                    benchmarks=case.get("benchmarks", {}),
                    prompts=case.get("prompts", PROMPTS),
                    api_keyword_args=case.get("api_keyword_args", API_KEYWORD_ARGS),
                    test_content=case.get("test_content", ["completion"]),
                    service_mode=case.get("service_mode", "openai"),
                    extra_config=extra_case_fields,
                )
            )
        return result
165
tests/e2e/nightly/single_node/models/scripts/test_single_node.py
Normal file
@@ -0,0 +1,165 @@
import logging
from typing import Any

import openai
import pytest

from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer, RemoteOpenAIServer
from tests.e2e.nightly.single_node.models.scripts.single_node_config import (
    SingleNodeConfig,
    SingleNodeConfigLoader,
)
from tools.aisbench import run_aisbench_cases

logger = logging.getLogger(__name__)

configs = SingleNodeConfigLoader.from_yaml_cases()


async def run_completion_test(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
    client = server.get_async_client()
    batch = await client.completions.create(
        model=config.model,
        prompt=config.prompts,
        **config.api_keyword_args,
    )
    choices: list[openai.types.CompletionChoice] = batch.choices
    assert choices[0].text, "empty response"
    print(choices)


async def run_image_test(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
    from tools.send_mm_request import send_image_request

    send_image_request(config.model, server)


async def run_chat_completion_test(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
    from tools.send_request import send_v1_chat_completions

    send_v1_chat_completions(
        config.prompts[0],
        model=config.model,
        server=server,
        request_args=config.api_keyword_args,
    )


def run_benchmark_comparisons(config: SingleNodeConfig, results: Any) -> None:
    """General assertion engine for aisbench outcomes mapped directly from YAML."""

    comparisons = config.extra_config.get("benchmark_comparisons_args", [])

    if not comparisons:
        return

    # Valid task keys defined in benchmarks mapping
    valid_keys = [k for k, v in config.benchmarks.items() if v]

    metrics_cache = {}

    for comp in comparisons:
        metric = comp.get("metric", "TTFT")
        baseline_key = comp.get("baseline")
        target_key = comp.get("target")
        ratio = comp.get("ratio", 1.0)
        op = comp.get("operator", "<")

        if not baseline_key or not target_key:
            logger.warning("Invalid comparison config: missing baseline or target. %s", comp)
            continue

        if metric not in metrics_cache:
            if metric == "TTFT":
                from tools.aisbench import get_TTFT

                # map TTFT outputs directly to their corresponding benchmark test case names
                metrics_cache[metric] = dict(zip(valid_keys, get_TTFT(results)))
            else:
                logger.warning("Unsupported metric for comparison: %s", metric)
                continue

        metric_dict = metrics_cache[metric]
        baseline_val = metric_dict.get(baseline_key)
        target_val = metric_dict.get(target_key)

        if baseline_val is None or target_val is None:
            logger.warning("Missing data to compare %s and %s in metrics: %s", baseline_key, target_key, metric_dict)
            continue

        expected_threshold = baseline_val * ratio

        eval_str = f"metric {metric}: {target_key}({target_val}) {op} {baseline_key}({baseline_val}) * {ratio}"

        if op == "<":
            assert target_val < expected_threshold, f"Assertion Failed: {eval_str} [threshold: {expected_threshold}]"
        elif op == ">":
            assert target_val > expected_threshold, f"Assertion Failed: {eval_str} [threshold: {expected_threshold}]"
        elif op == "<=":
            assert target_val <= expected_threshold, f"Assertion Failed: {eval_str} [threshold: {expected_threshold}]"
        elif op == ">=":
            assert target_val >= expected_threshold, f"Assertion Failed: {eval_str} [threshold: {expected_threshold}]"
        else:
            logger.warning("Unsupported comparison operator: %s", op)
            continue

        print(f"✅ Comparison passed: {eval_str} [threshold: {expected_threshold}]")


# Extend this dictionary to add new test capabilities
TEST_HANDLERS = {
    "completion": run_completion_test,
    "image": run_image_test,
    "chat_completion": run_chat_completion_test,
}


async def _dispatch_tests(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
    """Dispatches requested tests defined in yaml."""
    for test_name in config.test_content:
        if test_name == "benchmark_comparisons":
            continue

        handler = TEST_HANDLERS.get(test_name)
        if handler:
            await handler(config, server)
        else:
            logger.warning("No handler registered for test content type: %s", test_name)


def _run_benchmarks(config: SingleNodeConfig, port: int) -> None:
    """Run Aisbench benchmarks and process benchmark-dependent custom assertions."""
    aisbench_cases = [v for v in config.benchmarks.values() if v]
    if not aisbench_cases:
        return

    result = run_aisbench_cases(
        model=config.model,
        port=port,
        aisbench_cases=aisbench_cases,
    )

    if "benchmark_comparisons" in config.test_content:
        run_benchmark_comparisons(config, result)


@pytest.mark.asyncio
@pytest.mark.parametrize("config", configs, ids=[config.name for config in configs])
async def test_single_node(config: SingleNodeConfig) -> None:
    if config.service_mode == "epd":
        with (
            RemoteEPDServer(vllm_serve_args=config.epd_server_cmds, env_dict=config.envs) as _,
            DisaggEpdProxy(proxy_args=config.epd_proxy_args, env_dict=config.envs) as proxy,
        ):
            await _dispatch_tests(config, proxy)
            _run_benchmarks(config, proxy.port)
        return

    # Standard OpenAI service mode
    with RemoteOpenAIServer(
        model=config.model,
        vllm_serve_args=config.server_cmd,
        server_port=config.server_port,
        env_dict=config.envs,
        auto_port=False,
    ) as server:
        await _dispatch_tests(config, server)
        _run_benchmarks(config, config.server_port)
@@ -1,118 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "vllm-ascend/DeepSeek-R1-0528-W8A8",
]

MODES = [
    "single",
    "aclgraph",
]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 32768,
    "batch_size": 32,
    "baseline": 95,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 400,
    "max_out_len": 1500,
    "batch_size": 1000,
    "baseline": 1,
    "threshold": 0.97
}]


def config():
    port = get_open_port()
    env_dict = {
        "OMP_NUM_THREADS": "10",
        "OMP_PROC_BIND": "false",
        "HCCL_BUFFSIZE": "1024",
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True"
    }
    speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
    additional_config = {"enable_weight_nz_layout": True}
    server_args = [
        "--quantization", "ascend", "--data-parallel-size", "2",
        "--tensor-parallel-size", "8", "--enable-expert-parallel", "--port",
        str(port), "--seed", "1024", "--max-model-len", "36864",
        "--max-num-batched-tokens", "4096", "--max-num-seqs", "16",
        "--trust-remote-code", "--gpu-memory-utilization", "0.9",
        "--speculative-config",
        json.dumps(speculative_config)
    ]
    return port, env_dict, additional_config, server_args


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
async def test_models(model: str, mode: str) -> None:
    port, env_dict, additional_config, server_args = config()
    if mode == "single":
        server_args.append("--enforce-eager")
    server_args.extend(["--additional-config", json.dumps(additional_config)])
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        print(choices)
        if mode in ["single"]:
            return
        # aisbench test
        run_aisbench_cases(model,
                           port,
                           aisbench_cases,
                           server_args=server_args)
@@ -1,82 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
from .test_deepseek_r1_0528_w8a8 import *

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 32768,
    "batch_size": 32,
    "baseline": 95,
    "threshold": 5
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_models_eplb(model: str) -> None:
    port, env_dict, additional_config, server_args = config()
    additional_config.update(
        {
            "eplb_config": {
                "dynamic_eplb": "true",
                "expert_heat_collection_interval": 1000,
                "algorithm_execution_interval": 50,
                "eplb_policy_type": 3,
            }
        }
    )
    env_dict.update(
        {
            "DYNAMIC_EPLB": "true",
        }
    )
    server_args.extend(["--additional-config", json.dumps(additional_config)])
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        print(choices)
        # aisbench test
        run_aisbench_cases(model,
                           port,
                           aisbench_cases,
                           server_args=server_args)
@@ -1,123 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "vllm-ascend/DeepSeek-R1-W8A8",
]

MODES = [
    "single",
]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 6000,
    "batch_size": 32,
    "baseline": 95,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 32,
    "max_out_len": 1500,
    "batch_size": 32,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
async def test_models(model: str, mode: str) -> None:
    port = get_open_port()
    env_dict = {
        "HCCL_BUFFSIZE": "1024",
    }

    additional_config = {
        "ascend_scheduler_config": {
            "enabled": False
        },
        "torchair_graph_config": {
            "enabled": False,
            "enable_multistream_shared_expert": False
        }
    }

    server_args = [
        "--quantization", "ascend", "--port",
        str(port), "--data-parallel-size", "8", "--data-parallel-size-local",
        "8", "--data-parallel-rpc-port", "13389", "--tensor-parallel-size",
        "2", "--enable-expert-parallel", "--seed", "1024", "--max-num-seqs",
        "32", "--max-model-len", "6000", "--max-num-batched-tokens", "6000",
        "--trust-remote-code", "--gpu-memory-utilization", "0.92",
        "--no-enable-prefix-caching", "--reasoning-parser", "deepseek_r1"
    ]

    if mode == "single":
        server_args.append("--enforce-eager")

    server_args.extend(["--additional-config", json.dumps(additional_config)])

    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }

    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        print(choices)
        # aisbench test
        if mode in ["single"]:
            return
        run_aisbench_cases(model,
                           port,
                           aisbench_cases,
                           server_args=server_args)
@@ -1,122 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = ["vllm-ascend/DeepSeek-V3.2-W8A8"]

TENSOR_PARALLELS = [8]
DATA_PARALLELS = [2]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 4096,
    "batch_size": 8,
    "baseline": 95,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 1,
    "max_out_len": 1500,
    "batch_size": 1,
    "request_rate": 11.2,
    "baseline": 134,
    "threshold": 0.97
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 100,
    "max_out_len": 1500,
    "batch_size": 4,
    "request_rate": 11.2,
    "baseline": 134,
    "threshold": 0.97
}
]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
async def test_models(model: str, tp_size: int, dp_size: int) -> None:
    port = get_open_port()
    env_dict = {
        "HCCL_OP_EXPANSION_MODE": "AIV",
        "OMP_PROC_BIND": "false",
        "OMP_NUM_THREADS": "1",
        "HCCL_BUFFSIZE": "1024",
        "VLLM_ASCEND_ENABLE_MLAPO": "1",
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1",
        "VLLM_ENGINE_READY_TIMEOUT_S": "1800"
    }

    server_args = [
        "--enable-expert-parallel", "--tensor-parallel-size",
        str(tp_size), "--data-parallel-size",
        str(dp_size), "--port",
        str(port), "--max-model-len", "8192", "--max-num-batched-tokens",
        "8192", "--max-num-seqs", "4", "--trust-remote-code", "--quantization",
        "ascend", "--gpu-memory-utilization", "0.98", "--compilation-config",
        '{"cudagraph_capture_sizes":[8, 16, 24, 32, 40, 48], "cudagraph_mode":"FULL_DECODE_ONLY"}',
        "--speculative-config",
        '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}',
        "--additional-config",
        '{"layer_sharding": ["q_b_proj", "o_proj"]}',
        "--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32"
    ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        # aisbench test
        run_aisbench_cases(model, port, aisbench_cases)
@@ -1,115 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "ZhipuAI/GLM-4.5",
]

TENSOR_PARALLELS = [8]
DATA_PARALLELS = [2]
FULL_GRAPH = [True, False]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 4096,
    "batch_size": 8,
    "baseline": 95,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 16,
    "max_out_len": 1500,
    "batch_size": 8,
    "request_rate": 0,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
@pytest.mark.parametrize("full_graph", FULL_GRAPH)
async def test_models(model: str, tp_size: int, dp_size: int,
                      full_graph: bool) -> None:
    port = get_open_port()
    env_dict = {"HCCL_BUFFSIZE": "1024"}
    server_args = [
        "--no-enable-prefix-caching",
        "--enable-expert-parallel",
        "--tensor-parallel-size",
        str(tp_size),
        "--data-parallel-size",
        str(dp_size),
        "--port",
        str(port),
        "--max-model-len",
        "8192",
        "--max-num-batched-tokens",
        "8192",
        "--block-size",
        "16",
        "--trust-remote-code",
        "--gpu-memory-utilization",
        "0.9",
    ]
    if full_graph:
        server_args += [
            "--compilation-config",
            '{"cudagraph_capture": [1,2,4,8,16], "cudagraph_model":"FULL_DECODE_ONLY"}'
        ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        # aisbench test
        run_aisbench_cases(model, port, aisbench_cases)
@@ -1,110 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "moonshotai/Kimi-K2-Thinking",
]

TENSOR_PARALLELS = [16]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 4096,
    "batch_size": 32,
    "baseline": 95,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 512,
    "max_out_len": 256,
    "batch_size": 64,
    "trust_remote_code": True,
    "request_rate": 11.2,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
    port = get_open_port()
    env_dict = {
        "HCCL_BUFFSIZE": "1024",
        "TASK_QUEUE_ENABLE": "1",
        "OMP_PROC_BIND": "false",
        "HCCL_OP_EXPANSION_MODE": "AIV",
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True"
    }
    server_args = [
        "--tensor-parallel-size",
        str(tp_size),
        "--port",
        str(port),
        "--max-model-len",
        "8192",
        "--max-num-batched-tokens",
        "8192",
        "--max-num-seqs",
        "12",
        "--gpu-memory-utilization",
        "0.9",
        "--trust-remote-code",
        "--enable-expert-parallel",
        "--no-enable-prefix-caching",
    ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        # aisbench test
        run_aisbench_cases(model, port, aisbench_cases)
@@ -1,140 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "vllm-ascend/DeepSeek-R1-0528-W8A8",
]

MODES = ["mtp2", "mtp3"]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_gsm8k = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 32768,
    "batch_size": 32,
    "baseline": 95,
    "threshold": 5
}]

aisbench_aime = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/aime2024",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "aime2024/aime2024_gen_0_shot_chat_prompt",
    "max_out_len": 32768,
    "batch_size": 32,
    "baseline": 86.67,
    "threshold": 7
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
async def test_models(model: str, mode: str) -> None:
    port = get_open_port()
    env_dict = {
        "OMP_NUM_THREADS": "100",
        "OMP_PROC_BIND": "false",
        "HCCL_BUFFSIZE": "1024",
        "VLLM_RPC_TIMEOUT": "3600000",
        "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
    }
    speculative_config = {"num_speculative_tokens": 2, "method": "mtp"}
    compilation_config = {
        "cudagraph_capture_sizes": [56],
        "cudagraph_mode": "FULL_DECODE_ONLY"
    }
    server_args = [
        "--quantization",
        "ascend",
        "--seed",
        "1024",
        "--no-enable-prefix-caching",
        "--data-parallel-size",
        "2",
        "--tensor-parallel-size",
        "8",
        "--enable-expert-parallel",
        "--port",
        str(port),
        "--max-model-len",
        "40960",
        "--max-num-seqs",
        "14",
        "--trust-remote-code",
    ]
    if mode == "mtp2":
        server_args.extend(["--max-num-batched-tokens", "4096"])
        server_args.extend(
            ["--speculative-config",
             json.dumps(speculative_config)])
        server_args.extend(["--gpu-memory-utilization", "0.92"])
        aisbench_cases = aisbench_gsm8k
    if mode == "mtp3":
        env_dict["HCCL_OP_EXPANSION_MODE"] = "AIV"
        server_args.extend(["--max-num-batched-tokens", "2048"])
        speculative_config["num_speculative_tokens"] = 3
        server_args.extend(
            ["--speculative-config",
             json.dumps(speculative_config)])
        server_args.extend(["--gpu-memory-utilization", "0.9"])
        server_args.extend(
            ["--compilation-config",
             json.dumps(compilation_config)])
        aisbench_cases = aisbench_aime
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        print(choices)
        # aisbench test
        run_aisbench_cases(model,
                           port,
                           aisbench_cases,
                           server_args=server_args)
@@ -1,107 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json

import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import get_TTFT, run_aisbench_cases

MODELS = [
    "vllm-ascend/DeepSeek-R1-0528-W8A8",
]

aisbench_warm_up = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in1024-bs210",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 210,
    "max_out_len": 2,
    "batch_size": 1000,
    "baseline": 0,
    "threshold": 0.97
}]

aisbench_cases0 = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/prefix0-in3500-bs210",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 210,
    "max_out_len": 1500,
    "batch_size": 18,
    "baseline": 1,
    "threshold": 0.97
}]

aisbench_cases75 = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/prefix75-in3500-bs210",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 210,
    "max_out_len": 1500,
    "batch_size": 18,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_models(model: str) -> None:
    port = get_open_port()
    env_dict = {
        "OMP_NUM_THREADS": "10",
        "OMP_PROC_BIND": "false",
        "HCCL_BUFFSIZE": "1024",
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
    }
    additional_config = {"enable_weight_nz_layout": True}
    speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
    server_args = [
        "--quantization", "ascend", "--data-parallel-size", "2",
        "--tensor-parallel-size", "8", "--enable-expert-parallel", "--port",
        str(port), "--seed", "1024", "--max-model-len", "5200",
        "--max-num-batched-tokens", "4096", "--max-num-seqs", "16",
        "--trust-remote-code", "--gpu-memory-utilization", "0.9",
        "--additional-config",
        json.dumps(additional_config), "--speculative-config",
        json.dumps(speculative_config)
    ]
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False):
        run_aisbench_cases(model, port, aisbench_warm_up)
        result = run_aisbench_cases(model, port, aisbench_cases0)
        TTFT0 = get_TTFT(result)
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False):
        run_aisbench_cases(model, port, aisbench_warm_up)
        result = run_aisbench_cases(model, port, aisbench_cases75)
        TTFT75 = get_TTFT(result)
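    # prefix75 requests share a 75% common prompt prefix while prefix0
    # requests share none, so with prefix caching warm the average TTFT is
    # expected to drop by at least 20%.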
    assert TTFT75 < 0.8 * TTFT0, f"The TTFT for prefix75 {TTFT75} is not less than 0.8*TTFT for prefix0 {TTFT0}."
    print(
        f"The TTFT for prefix75 {TTFT75} is less than 0.8*TTFT for prefix0 {TTFT0}."
    )
@@ -1,99 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json

import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import get_TTFT, run_aisbench_cases

MODELS = [
    "vllm-ascend/Qwen3-32B-W8A8",
]

aisbench_warm_up = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in1024-bs210",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 210,
    "max_out_len": 2,
    "batch_size": 1000,
    "baseline": 0,
    "threshold": 0.97
}]

aisbench_cases0 = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/prefix0-in3500-bs210",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 210,
    "max_out_len": 1500,
    "batch_size": 48,
    "baseline": 1,
    "threshold": 0.97
}]

aisbench_cases75 = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/prefix75-in3500-bs210",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 210,
    "max_out_len": 1500,
    "batch_size": 48,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_models(model: str) -> None:
    port = get_open_port()
    env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
    additional_config = {"enable_weight_nz_layout": True}
    server_args = [
        "--quantization", "ascend", "--reasoning-parser", "qwen3",
        "--tensor-parallel-size", "4", "--port",
        str(port), "--max-model-len", "8192", "--max-num-batched-tokens",
        "8192", "--max-num-seqs", "256", "--trust-remote-code",
        "--gpu-memory-utilization", "0.9", "--additional-config",
        json.dumps(additional_config)
    ]
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False):
        run_aisbench_cases(model, port, aisbench_warm_up)
        result = run_aisbench_cases(model, port, aisbench_cases0)
        TTFT0 = get_TTFT(result)
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False):
        run_aisbench_cases(model, port, aisbench_warm_up)
        result = run_aisbench_cases(model, port, aisbench_cases75)
        TTFT75 = get_TTFT(result)
    assert TTFT75 < 0.8 * TTFT0, f"The TTFT for prefix75 {TTFT75} is not less than 0.8*TTFT for prefix0 {TTFT0}."
    print(
        f"The TTFT for prefix75 {TTFT75} is less than 0.8*TTFT for prefix0 {TTFT0}."
    )
@@ -1,110 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
from tools.send_mm_request import send_image_request

MODELS = [
    "Qwen/Qwen2.5-VL-32B-Instruct",
]

TENSOR_PARALLELS = [4]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/textvqa-lite",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "textvqa/textvqa_gen_base64",
    "max_out_len": 2048,
    "batch_size": 128,
    "baseline": 76.22,
    "temperature": 0,
    "top_k": -1,
    "top_p": 1,
    "repetition_penalty": 1,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/textvqa-perf-1080p",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "textvqa/textvqa_gen_base64",
    "num_prompts": 512,
    "max_out_len": 256,
    "batch_size": 128,
    "temperature": 0,
    "top_k": -1,
    "top_p": 1,
    "repetition_penalty": 1,
    "request_rate": 0,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
    port = get_open_port()
    env_dict = {
        "TASK_QUEUE_ENABLE": "1",
        "VLLM_ASCEND_ENABLE_NZ": "0",
        "HCCL_OP_EXPANSION_MODE": "AIV"
    }
    server_args = [
        "--no-enable-prefix-caching", "--mm-processor-cache-gb", "0",
        "--tensor-parallel-size",
        str(tp_size), "--port",
        str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
        "40000", "--max-num-seqs", "400", "--trust-remote-code",
        "--gpu-memory-utilization", "0.8", "--compilation_config",
        '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        print(choices)
        send_image_request(model, server)
        # aisbench test
        run_aisbench_cases(model, port, aisbench_cases)
@@ -1,102 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
from tools.send_mm_request import send_image_request

MODELS = [
    "Qwen/Qwen2.5-VL-7B-Instruct",
]

TENSOR_PARALLELS = [4]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/textvqa-lite",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "textvqa/textvqa_gen_base64",
    "max_out_len": 2048,
    "batch_size": 128,
    "baseline": 82.05,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/textvqa-perf-1080p",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "textvqa/textvqa_gen_base64",
    "num_prompts": 512,
    "max_out_len": 256,
    "batch_size": 128,
    "request_rate": 0,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
    port = get_open_port()
    env_dict = {
        "TASK_QUEUE_ENABLE": "1",
        "VLLM_ASCEND_ENABLE_NZ": "0",
        "HCCL_OP_EXPANSION_MODE": "AIV"
    }
    server_args = [
        "--no-enable-prefix-caching", "--mm-processor-cache-gb", "0",
        "--tensor-parallel-size",
        str(tp_size), "--port",
        str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
        "40000", "--max-num-seqs", "400", "--trust-remote-code",
        "--gpu-memory-utilization", "0.8", "--compilation_config",
        '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        print(choices)
        send_image_request(model, server)
        # aisbench test
        run_aisbench_cases(model, port, aisbench_cases)
@@ -1,110 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "Qwen/Qwen2.5-VL-7B-Instruct",
]
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
TENSOR_PARALLELS = [1]

warmup_cases = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/textvqa-perf-1080p",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "textvqa/textvqa_gen_base64",
    "num_prompts": 50,
    "max_out_len": 20,
    "batch_size": 32,
    "request_rate": 0,
    "baseline": 1,
    "threshold": 0.97
}]
aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/textvqa-lite",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "textvqa/textvqa_gen_base64",
    "max_out_len": 2048,
    "batch_size": 128,
    "baseline": 82.05,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/textvqa-perf-1080p",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "textvqa/textvqa_gen_base64",
    "num_prompts": 512,
    "max_out_len": 256,
    "batch_size": 128,
    "request_rate": 0,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
    encode_port = get_open_port()
    pd_port = get_open_port()
    vllm_server_args = [
        [
            "--port",
            str(encode_port), "--model", model, "--gpu-memory-utilization",
            "0.01", "--tensor-parallel-size",
            str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
            "--max-model-len", "10000", "--max-num-batched-tokens", "10000",
            "--max-num-seqs", "1", "--ec-transfer-config",
            '{"ec_connector_extra_config":{"shared_storage_path":"' +
            SHARED_STORAGE_PATH +
            '"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
        ],
        [
            "--port",
            str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
            "--tensor-parallel-size",
            str(tp_size), "--enforce-eager", "--max-model-len", "10000",
            "--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
            "--ec-transfer-config",
            '{"ec_connector_extra_config":{"shared_storage_path":"' +
            SHARED_STORAGE_PATH +
            '"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
        ]
    ]
    proxy_port = get_open_port()
    proxy_args = [
        "--host", "127.0.0.1", "--port",
        str(proxy_port), "--encode-servers-urls",
        f"http://localhost:{encode_port}", "--decode-servers-urls",
        f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
    ]
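    # Encode and prefill/decode instances run disaggregated: the ec_producer
    # writes multimodal encoder outputs to the shared-storage connector path
    # and the ec_consumer reads them back, with the proxy routing requests
    # between the two servers.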
    with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
        with DisaggEpdProxy(proxy_args=proxy_args) as _:
            # warm up
            run_aisbench_cases(model=model,
                               port=proxy_port,
                               aisbench_cases=warmup_cases)
            # aisbench test
            run_aisbench_cases(model, proxy_port, aisbench_cases)
@@ -1,71 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
from .test_qwen3_235b_w8a8 import *


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_models_eplb(model: str) -> None:
    port, aisbench_cases, env_dict, compilation_config, server_args = config()
    env_dict.update(
        {
            "DYNAMIC_EPLB": "true",
        }
    )
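    # Presumably the dynamic EPLB (expert parallel load balancing) knobs:
    # expert heat is sampled periodically and up to 16 redundant experts are
    # redistributed by the selected balancing policy.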
    additional_config: dict[str, Any] = {}
    additional_config["eplb_config"] = {
        "dynamic_eplb": "true",
        "expert_heat_collection_interval": 600,
        "algorithm_execution_interval": 50,
        "num_redundant_experts": 16,
        "eplb_policy_type": 2,
    }
    server_args.extend(["--additional-config", json.dumps(additional_config)])
    server_args.extend(
        ["--compilation-config",
         json.dumps(compilation_config)])
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        print(choices)
        # aisbench test
        run_aisbench_cases(model,
                           port,
                           aisbench_cases,
                           server_args=server_args)
@@ -1,104 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "vllm-ascend/Qwen3-235B-A22B-W8A8",
]

MODES = ["full_graph", "piecewise"]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

def config():
    port = get_open_port()
    aisbench_cases = [{
        "case_type": "accuracy",
        "dataset_path": "vllm-ascend/gsm8k-lite",
        "request_conf": "vllm_api_general_chat",
        "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
        "max_out_len": 32768,
        "batch_size": 32,
        "top_k": 20,
        "baseline": 95,
        "threshold": 5
    }]
    env_dict = {
        "OMP_NUM_THREADS": "10",
        "OMP_PROC_BIND": "false",
        "HCCL_BUFFSIZE": "1024",
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
    }
    compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
    server_args = [
        "--quantization", "ascend", "--async-scheduling",
        "--data-parallel-size", "4", "--tensor-parallel-size", "4",
        "--enable-expert-parallel", "--port",
        str(port), "--max-model-len", "40960", "--max-num-batched-tokens",
        "8192", "--max-num-seqs", "12", "--trust-remote-code",
        "--gpu-memory-utilization", "0.9"
    ]
    return port, aisbench_cases, env_dict, compilation_config, server_args


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
async def test_models(model: str, mode: str) -> None:
    port, aisbench_cases, env_dict, compilation_config, server_args = config()
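    # "full_graph" keeps the default FULL_DECODE_ONLY capture from config(),
    # while "piecewise" switches the ACL graph to piecewise capture mode.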
if mode == "piecewise":
|
||||
compilation_config["cudagraph_mode"] = "PIECEWISE"
|
||||
server_args.extend(
|
||||
["--compilation-config",
|
||||
json.dumps(compilation_config)])
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
}
|
||||
with RemoteOpenAIServer(model,
|
||||
server_args,
|
||||
server_port=port,
|
||||
env_dict=env_dict,
|
||||
auto_port=False) as server:
|
||||
client = server.get_async_client()
|
||||
batch = await client.completions.create(
|
||||
model=model,
|
||||
prompt=prompts,
|
||||
**request_keyword_args,
|
||||
)
|
||||
choices: list[openai.types.CompletionChoice] = batch.choices
|
||||
assert choices[0].text, "empty response"
|
||||
print(choices)
|
||||
# aisbench test
|
||||
run_aisbench_cases(model,
|
||||
port,
|
||||
aisbench_cases,
|
||||
server_args=server_args)
|
||||
@@ -1,92 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "vllm-ascend/Qwen3-30B-A3B-W8A8",
]

TENSOR_PARALLELS = [1]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_cases = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 180,
    "max_out_len": 1500,
    "batch_size": 45,
    "request_rate": 0,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
    port = get_open_port()
    env_dict = {
        "OMP_PROC_BIND": "false",
        "OMP_NUM_THREADS": "10",
        "HCCL_BUFFSIZE": "1024",
        "HCCL_OP_EXPANSION_MODE": "AIV",
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True"
    }
    server_args = [
        "--quantization", "ascend", "--async-scheduling",
        "--no-enable-prefix-caching", "--tensor-parallel-size",
        str(tp_size), "--port",
        str(port), "--max-model-len", "5600", "--max-num-batched-tokens",
        "16384", "--max-num-seqs", "100", "--trust-remote-code",
        "--gpu-memory-utilization", "0.9", "--compilation-config",
        '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        # aisbench test
        run_aisbench_cases(model, port, aisbench_cases)
@@ -1,99 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "Qwen/Qwen3-32B",
]

TENSOR_PARALLELS = [4]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 32768,
    "batch_size": 32,
    "baseline": 95,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 80,
    "max_out_len": 1500,
    "batch_size": 20,
    "request_rate": 0,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
    port = get_open_port()
    env_dict = {
        "TASK_QUEUE_ENABLE": "1",
        "OMP_PROC_BIND": "false",
        "HCCL_OP_EXPANSION_MODE": "AIV",
        "PAGED_ATTENTION_MASK_LEN": "5500"
    }
    server_args = [
        "--no-enable-prefix-caching", "--tensor-parallel-size",
        str(tp_size), "--port",
        str(port), "--max-model-len", "36864", "--max-num-batched-tokens",
        "36864", "--block-size", "128", "--trust-remote-code",
        "--gpu-memory-utilization", "0.9", "--additional-config",
        '{"enable_weight_nz_layout":true}'
    ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        # aisbench test
        run_aisbench_cases(model, port, aisbench_cases)
@@ -1,129 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
import os
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "vllm-ascend/Qwen3-32B-W8A8",
]

MODES = [
    "aclgraph",
    "single",
]

TENSOR_PARALLELS = [4]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

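# Per-runner benchmark batch sizes keyed by CI runner label; unknown runners
# fall back to a batch size of 1.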
batch_size_dict = {
    "linux-aarch64-a2b3-4": 72,
    "linux-aarch64-a3-4": 76,
}
VLLM_CI_RUNNER = os.getenv("VLLM_CI_RUNNER", "linux-aarch64-a2b3-4")
performance_batch_size = batch_size_dict.get(VLLM_CI_RUNNER, 1)

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/aime2024",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "aime2024/aime2024_gen_0_shot_chat_prompt",
    "max_out_len": 32768,
    "batch_size": 32,
    "baseline": 83.33,
    "threshold": 7
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 4 * performance_batch_size,
    "max_out_len": 1500,
    "batch_size": performance_batch_size,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, mode: str, tp_size: int) -> None:
    port = get_open_port()
    env_dict = {
        "TASK_QUEUE_ENABLE": "1",
        "HCCL_OP_EXPANSION_MODE": "AIV",
        "VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
    }
    compilation_config = {
        "cudagraph_mode":
        "FULL_DECODE_ONLY",
        "cudagraph_capture_sizes":
        [1, 12, 16, 20, 24, 32, 48, 60, 64, 68, 72, 76, 80]
    }
    server_args = [
        "--quantization", "ascend", "--no-enable-prefix-caching",
        "--tensor-parallel-size",
        str(tp_size), "--port",
        str(port), "--max-model-len", "40960", "--max-num-batched-tokens",
        "40960", "--block-size", "128", "--trust-remote-code",
        "--reasoning-parser", "qwen3", "--gpu-memory-utilization", "0.9",
        "--async-scheduling", "--additional-config",
        '{"weight_prefetch_config":{"enabled":true}}',
    ]
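    # "single" runs eager (no graph capture); "aclgraph" captures full decode
    # graphs at the sizes listed in compilation_config.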
if mode == "single":
|
||||
server_args.append("--enforce-eager")
|
||||
if mode == "aclgraph":
|
||||
server_args.extend(
|
||||
["--compilation-config",
|
||||
json.dumps(compilation_config)])
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
}
|
||||
with RemoteOpenAIServer(model,
|
||||
server_args,
|
||||
server_port=port,
|
||||
env_dict=env_dict,
|
||||
auto_port=False) as server:
|
||||
client = server.get_async_client()
|
||||
batch = await client.completions.create(
|
||||
model=model,
|
||||
prompt=prompts,
|
||||
**request_keyword_args,
|
||||
)
|
||||
choices: list[openai.types.CompletionChoice] = batch.choices
|
||||
assert choices[0].text, "empty response"
|
||||
print(choices)
|
||||
if mode == "single":
|
||||
return
|
||||
# aisbench test
|
||||
run_aisbench_cases(model, port, aisbench_cases)
|
||||
@@ -1,98 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases
from tools.send_request import send_v1_chat_completions

MODELS = [
    "vllm-ascend/Qwen3-32B-W8A8",
]

TENSOR_PARALLELS = [4]

prompts = [
    "9.11 and 9.8, which is greater?",
]

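# enable_thinking is forwarded via chat_template_kwargs and presumably turns
# on Qwen3's reasoning ("thinking") mode for the probe request.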
api_keyword_args = {
    "chat_template_kwargs": {
        "enable_thinking": True
    },
}

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_noncot_chat_prompt",
    "max_out_len": 10240,
    "batch_size": 32,
    "baseline": 96,
    "threshold": 4
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 240,
    "max_out_len": 1500,
    "batch_size": 60,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
    port = get_open_port()
    env_dict = {
        "VLLM_USE": "1",
        "TASK_QUEUE_ENABLE": "1",
        "HCCL_OP_EXPANSION_MODE": "AIV",
        "OMP_PROC_BIND": "false",
        "VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1",
        "VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
    }
    server_args = [
        "--quantization", "ascend", "--tensor-parallel-size",
        str(tp_size), "--port",
        str(port), "--trust-remote-code", "--reasoning-parser", "qwen3",
        "--distributed_executor_backend", "mp", "--gpu-memory-utilization",
        "0.9", "--block-size", "128", "--max-num-seqs", "256",
        "--enforce-eager", "--max-model-len", "35840",
        "--max-num-batched-tokens", "35840", "--additional-config",
        '{"enable_weight_nz_layout":true, "weight_prefetch_config":{"enabled": true}}',
        "--compilation-config",
        '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
    ]
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        send_v1_chat_completions(prompts[0],
                                 model,
                                 server,
                                 request_args=api_keyword_args)
        # aisbench test
        run_aisbench_cases(model, port, aisbench_cases)
@@ -1,111 +0,0 @@
import json
import os
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "Qwen/Qwen3-Next-80B-A3B-Instruct",
]

MODES = ["aclgraph"]

TENSOR_PARALLELS = [4]
MAX_NUM_BATCHED_TOKENS = [8192, 32768]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

batch_size_dict = {
    "linux-aarch64-a2b3-4": 64,
    "linux-aarch64-a3-4": 64,
}
VLLM_CI_RUNNER = os.getenv("VLLM_CI_RUNNER", "linux-aarch64-a2b3-4")
performance_batch_size = batch_size_dict.get(VLLM_CI_RUNNER, 1)

aisbench_cases = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 4 * performance_batch_size,
    "max_out_len": 1500,
    "batch_size": performance_batch_size,
    "baseline": 1,
    "threshold": 0.97
}, {
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 32768,
    "batch_size": 32,
    "top_k": 20,
    "baseline": 95,
    "threshold": 5
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("max_num_batched_tokens", MAX_NUM_BATCHED_TOKENS)
async def test_models(model: str, mode: str, tp_size: int,
                      max_num_batched_tokens: int) -> None:
    port = get_open_port()
    env_dict = {
        "OMP_NUM_THREADS": "10",
        "OMP_PROC_BIND": "false",
        "HCCL_BUFFSIZE": "1024",
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
    }
    server_args = [
        "--tensor-parallel-size",
        str(tp_size),
        "--port",
        str(port),
        "--max-model-len",
        "40960",
        "--max-num-batched-tokens",
        str(max_num_batched_tokens),
        "--trust-remote-code",
        "--async-scheduling",
        "--no-enable-prefix-caching",
        "--enable-expert-parallel",
        "--gpu-memory-utilization",
        "0.8",
        "--max-num-seqs",
        "64",
    ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        print(choices)
        if mode == "single":
            return
        # aisbench test
        run_aisbench_cases(model, port, aisbench_cases)
@@ -1,104 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8",
]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 32768,
    "batch_size": 32,
    "baseline": 95,
    "threshold": 5
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_models(model: str) -> None:
    port = get_open_port()
    env_dict = {
        "OMP_NUM_THREADS": "10",
        "OMP_PROC_BIND": "false",
        "HCCL_BUFFSIZE": "1024",
    }
    server_args = [
        "--quantization",
        "ascend",
        "--async-scheduling",
        "--no-enable-prefix-caching",
        "--data-parallel-size",
        "1",
        "--tensor-parallel-size",
        "4",
        "--enable-expert-parallel",
        "--port",
        str(port),
        "--max-model-len",
        "40960",
        "--max-num-batched-tokens",
        "8192",
        "--max-num-seqs",
        "32",
        "--trust-remote-code",
        "--gpu-memory-utilization",
        "0.65",
        "--compilation-config",
        '{"cudagraph_capture_sizes": [32]}',
    ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        print(choices)
        # aisbench test
        run_aisbench_cases(model,
                           port,
                           aisbench_cases,
                           server_args=server_args)
@@ -1,115 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "Qwen/QwQ-32B",
]

MODES = [
    "aclgraph",
    "single",
]

TENSOR_PARALLELS = [4]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 32768,
    "batch_size": 32,
    "baseline": 95,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 240,
    "max_out_len": 1500,
    "batch_size": 60,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, mode: str, tp_size: int) -> None:
    port = get_open_port()
    env_dict = {
        "TASK_QUEUE_ENABLE": "1",
        "OMP_PROC_BIND": "false",
        "HCCL_OP_EXPANSION_MODE": "AIV",
        "VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
        "VLLM_ASCEND_ENABLE_DEBSE_OPTIMIZE": "1"
    }
    server_args = [
        "--tensor-parallel-size",
        str(tp_size), "--port",
        str(port), "--max-model-len", "36864", "--max-num-batched-tokens",
        "36864", "--block-size", "128", "--trust-remote-code",
        "--gpu-memory-utilization", "0.9", "--compilation_config",
        '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}',
        "--reasoning-parser", "deepseek_r1", "--distributed_executor_backend",
        "mp", "--additional-config", '{"weight_prefetch_config":{"enabled":true}}'
    ]
    if mode == "single":
        server_args.remove("--compilation_config")
        server_args.remove(
            '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
        )
        server_args.append("--enforce-eager")
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        if mode == "single":
            return
        # aisbench test
        run_aisbench_cases(model, port, aisbench_cases)
@@ -245,9 +245,11 @@ def run_aisbench_cases(model, port, aisbench_cases, server_args="", host_ip="loc
    return aisbench_results


def get_TTFT(result):
    TTFT = result[0][0].loc["TTFT", "Average"][:-3]
    return float(TTFT)
def get_TTFT(results):
    TTFT = []
    for i in range(len(results)):
        TTFT.append(float(results[i][0].loc["TTFT", "Average"][:-3]))
    return TTFT
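Note: in this hunk the first get_TTFT (returning a single float from one result) is the removed version; the second, which returns one averaged TTFT per aisbench result, replaces it.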


temp_dir = tempfile.gettempdir()
Block a user