[CI]Add Disaggregated PD Nightly Test for Qwen3-235B and Qwen3-VL-235B (#5502)
### What this PR does / why we need it?
This PR adds online **Disaggregated Prefill/Decode** performance and
accuracy tests for the **Qwen3-235B-A22B** and
**Qwen3-VL-235B-A22B-Instruct** models to the Nightly test suite.
These test configurations exercise the deployment of large MoE and
Vision-Language models in **a dual-node (32 NPU)** environment, using
Mooncake as the KV cache transfer engine between the Prefill node and
the Decode node.
#### Test Configuration
**Qwen3-235B-A22B**
- Model: Qwen/Qwen3-235B-A22B
- Hardware: A3, 2 Nodes (32 NPUs total, 16 NPUs per node)
- Architecture: Disaggregated Prefill & Decode
  - Node 0 (Producer/Prefill): **DP2 + TP8 + EP + FLASHCOMM1 + FUSED_MC2**.
  - Node 1 (Consumer/Decode): **DP4 + TP4 + EP + FLASHCOMM1 + FUSED_MC2 + FULL_DECODE_ONLY**.
- Benchmarks:
  - Performance: vllm-ascend/GSM8K-in3500-bs2800.
  - Accuracy: vllm-ascend/gsm8k-lite.
**Qwen3-VL-235B-A22B-Instruct**
- Model: Qwen/Qwen3-VL-235B-A22B-Instruct
- Hardware: A3, 2 Nodes (32 NPUs total, 16 NPUs per node)
- Architecture: Disaggregated Prefill & Decode
  - Node 0 (Producer/Prefill): **DP2 + TP8 + EP**.
  - Node 1 (Consumer/Decode): **DP4 + TP4 + EP + FULL_DECODE_ONLY**.
- Benchmarks:
  - Performance: vllm-ascend/textvqa-perf-1080p.
  - Accuracy: vllm-ascend/textvqa-lite.
### How was this patch tested?
Nightly test action on CI
- vLLM version: v0.13.0
- vLLM main: 45c1ca1ca1
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
This commit is contained in:
6
.github/workflows/nightly_test_a3.yaml
vendored
6
.github/workflows/nightly_test_a3.yaml
vendored
@@ -77,6 +77,12 @@ jobs:
|
||||
- name: multi-node-qwenw8a8-2node-longseq
|
||||
config_file_path: Qwen3-235B-W8A8-longseq.yaml
|
||||
size: 2
|
||||
- name: multi-node-qwen-disagg-pd
|
||||
config_file_path: Qwen3-235B-disagg-pd.yaml
|
||||
size: 2
|
||||
- name: multi-node-qwen-vl-disagg-pd
|
||||
config_file_path: Qwen3-VL-235B-disagg-pd.yaml
|
||||
size: 2
|
||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||
with:
|
||||
soc_version: a3
|
||||
|
||||
121
tests/e2e/nightly/multi_node/config/Qwen3-235B-disagg-pd.yaml
Normal file
121
tests/e2e/nightly/multi_node/config/Qwen3-235B-disagg-pd.yaml
Normal file
@@ -0,0 +1,121 @@
|
||||
test_name: "test Qwen3-235B-A22B disaggregated_prefill"
|
||||
model: "Qwen/Qwen3-235B-A22B"
|
||||
num_nodes: 2
|
||||
npu_per_node: 16
|
||||
env_common:
|
||||
VLLM_USE_MODELSCOPE: true
|
||||
PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
|
||||
HCCL_BUFFSIZE: 1024
|
||||
HCCL_OP_EXPANSION_MODE: "AIV"
|
||||
OMP_PROC_BIND: false
|
||||
OMP_NUM_THREADS: 1
|
||||
VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
|
||||
VLLM_ASCEND_ENABLE_FUSED_MC2: 2
|
||||
TASK_QUEUE_ENABLE: 1
|
||||
SERVER_PORT: 8080
|
||||
|
||||
disaggregated_prefill:
|
||||
enabled: true
|
||||
prefiller_host_index: [0]
|
||||
decoder_host_index: [1]
|
||||
|
||||
deployment:
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve "Qwen/Qwen3-235B-A22B"
|
||||
--host 0.0.0.0
|
||||
--port $SERVER_PORT
|
||||
--data-parallel-size 2
|
||||
--data-parallel-size-local 2
|
||||
--data-parallel-start-rank 0
|
||||
--data-parallel-address $LOCAL_IP
|
||||
--data-parallel-rpc-port 13389
|
||||
--tensor-parallel-size 8
|
||||
--seed 1024
|
||||
--max-num-seqs 32
|
||||
--max-model-len 8192
|
||||
--max-num-batched-tokens 8192
|
||||
--enable-expert-parallel
|
||||
--trust-remote-code
|
||||
--gpu-memory-utilization 0.9
|
||||
--no-enable-prefix-caching
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"use_ascend_direct": true,
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
"tp_size": 8
|
||||
},
|
||||
"decode": {
|
||||
"dp_size": 4,
|
||||
"tp_size": 4
|
||||
}
|
||||
}
|
||||
}'
|
||||
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve "Qwen/Qwen3-235B-A22B"
|
||||
--host 0.0.0.0
|
||||
--port $SERVER_PORT
|
||||
--data-parallel-size 4
|
||||
--data-parallel-size-local 4
|
||||
--data-parallel-start-rank 0
|
||||
--data-parallel-address $LOCAL_IP
|
||||
--data-parallel-rpc-port 13389
|
||||
--tensor-parallel-size 4
|
||||
--seed 1024
|
||||
--max-num-seqs 32
|
||||
--max-model-len 8192
|
||||
--max-num-batched-tokens 8192
|
||||
--enable-expert-parallel
|
||||
--trust-remote-code
|
||||
--gpu-memory-utilization 0.9
|
||||
--no-enable-prefix-caching
|
||||
--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
|
||||
--async-scheduling
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"use_ascend_direct": true,
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
"tp_size": 8
|
||||
},
|
||||
"decode": {
|
||||
"dp_size": 4,
|
||||
"tp_size": 4
|
||||
}
|
||||
}
|
||||
}'
|
||||
|
||||
benchmarks:
|
||||
perf:
|
||||
case_type: performance
|
||||
dataset_path: vllm-ascend/GSM8K-in3500-bs2800
|
||||
request_conf: vllm_api_stream_chat
|
||||
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
|
||||
num_prompts: 2800
|
||||
max_out_len: 1500
|
||||
batch_size: 700
|
||||
request_rate: 11.2
|
||||
baseline: 1
|
||||
threshold: 0.97
|
||||
acc:
|
||||
case_type: accuracy
|
||||
dataset_path: vllm-ascend/gsm8k-lite
|
||||
request_conf: vllm_api_general_chat
|
||||
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
|
||||
max_out_len: 7680
|
||||
batch_size: 512
|
||||
baseline: 97
|
||||
threshold: 3
|
||||
108
tests/e2e/nightly/multi_node/config/Qwen3-VL-235B-disagg-pd.yaml
Normal file
108
tests/e2e/nightly/multi_node/config/Qwen3-VL-235B-disagg-pd.yaml
Normal file
@@ -0,0 +1,108 @@
|
||||
test_name: "test Qwen3-VL-235B-A22B disaggregated_prefill"
|
||||
model: "Qwen/Qwen3-VL-235B-A22B-Instruct"
|
||||
num_nodes: 2
|
||||
npu_per_node: 16
|
||||
env_common:
|
||||
VLLM_USE_MODELSCOPE: true
|
||||
HCCL_BUFFSIZE: 1024
|
||||
SERVER_PORT: 8080
|
||||
OMP_PROC_BIND: false
|
||||
OMP_NUM_THREADS: 1
|
||||
HCCL_OP_EXPANSION_MODE: "AIV"
|
||||
TASK_QUEUE_ENABLE: 1
|
||||
PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
|
||||
|
||||
disaggregated_prefill:
|
||||
enabled: true
|
||||
prefiller_host_index: [0]
|
||||
decoder_host_index: [1]
|
||||
|
||||
deployment:
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve "Qwen/Qwen3-VL-235B-A22B-Instruct"
|
||||
--host 0.0.0.0
|
||||
--port $SERVER_PORT
|
||||
--data-parallel-size 2
|
||||
--data-parallel-size-local 2
|
||||
--tensor-parallel-size 8
|
||||
--seed 1024
|
||||
--enable-expert-parallel
|
||||
--max-num-seqs 32
|
||||
--max-model-len 8192
|
||||
--max-num-batched-tokens 8192
|
||||
--trust-remote-code
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.9
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
"tp_size": 8
|
||||
},
|
||||
"decode": {
|
||||
"dp_size": 4,
|
||||
"tp_size": 4
|
||||
}
|
||||
}
|
||||
}'
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve "Qwen/Qwen3-VL-235B-A22B-Instruct"
|
||||
--host 0.0.0.0
|
||||
--port $SERVER_PORT
|
||||
--data-parallel-size 4
|
||||
--data-parallel-size-local 4
|
||||
--tensor-parallel-size 4
|
||||
--seed 1024
|
||||
--enable-expert-parallel
|
||||
--max-num-seqs 32
|
||||
--max-model-len 8192
|
||||
--max-num-batched-tokens 8192
|
||||
--trust-remote-code
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.9
|
||||
--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "1",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 2,
|
||||
"tp_size": 8
|
||||
},
|
||||
"decode": {
|
||||
"dp_size": 4,
|
||||
"tp_size": 4
|
||||
}
|
||||
}
|
||||
}'
|
||||
benchmarks:
|
||||
perf:
|
||||
case_type: performance
|
||||
dataset_path: vllm-ascend/textvqa-perf-1080p
|
||||
request_conf: vllm_api_stream_chat
|
||||
dataset_conf: textvqa/textvqa_gen_base64
|
||||
num_prompts: 2800
|
||||
max_out_len: 1500
|
||||
batch_size: 64
|
||||
request_rate: 11.2
|
||||
baseline: 1
|
||||
threshold: 0.97
|
||||
acc:
|
||||
case_type: accuracy
|
||||
dataset_path: vllm-ascend/textvqa-lite
|
||||
request_conf: vllm_api_stream_chat
|
||||
dataset_conf: textvqa/textvqa_gen_base64
|
||||
max_out_len: 7680
|
||||
batch_size: 64
|
||||
baseline: 85
|
||||
threshold: 5
|
||||
@@ -108,8 +108,12 @@ class AscendConfig:
|
||||
decode_tp_size = min(decode_tp_size, num_kv_head)
|
||||
self.pd_head_ratio = prefill_tp_size // decode_tp_size
|
||||
except Exception:
|
||||
raise AssertionError(
|
||||
"Can not get num_key_value_heads from model_config")
|
||||
raise ValueError(
|
||||
"The text_config extracted from the model config does not have "
|
||||
"`num_key_value_heads` attribute. This indicates a mismatch "
|
||||
"between the model config and vLLM's expectations. Please "
|
||||
"ensure that the model config is compatible with vLLM."
|
||||
)
|
||||
|
||||
if self.pd_tp_ratio == 0:
|
||||
raise AssertionError(
|
||||
|
||||
Reference in New Issue
Block a user