[CI] Add Disaggregated PD Nightly Test for Qwen3-235B and Qwen3-VL-235B (#5502)
### What this PR does / why we need it?
This PR adds online **Disaggregated Prefill/Decode** performance and
accuracy tests for the **Qwen3-235B-A22B** and
**Qwen3-VL-235B-A22B-Instruct** models to the Nightly test suite.
These test configurations simulate deploying large MoE and
Vision-Language models in **a dual-node (32 NPU)** environment, using
Mooncake (KVCache Transfer) to move the KV cache efficiently between the
Prefill node and the Decode node.
#### Test Configuration
**Qwen3-235B-A22B**
- Model: Qwen/Qwen3-235B-A22B
- Hardware: A3, 2 Nodes (32 NPUs total, 16 NPUs per node)
- Architecture: Disaggregated Prefill & Decode
- Node 0 (Producer/Prefill): **DP2 + TP8 + EP + FLASHCOMM1 +
FUSED_MC2**.
- Node 1 (Consumer/Decode): **DP4 + TP4 + EP + FLASHCOMM1 + FUSED_MC2 +
FULL_DECODE_ONLY**.
- Benchmarks:
- Performance: vllm-ascend/GSM8K-in3500-bs2800.
- Accuracy: vllm-ascend/gsm8k-lite.
**Qwen3-VL-235B-A22B-Instruct**
- Model: Qwen/Qwen3-VL-235B-A22B-Instruct
- Hardware: A3, 2 Nodes (32 NPUs total, 16 NPUs per node)
- Architecture: Disaggregated Prefill & Decode
- Node 0 (Producer/Prefill): **DP2 + TP8 + EP**.
- Node 1 (Consumer/Decode): **DP4 + TP4 + EP + FULL_DECODE_ONLY**.
- Benchmarks:
- Performance: vllm-ascend/textvqa-perf-1080p.
- Accuracy: vllm-ascend/textvqa-lite.
### How was this patch tested?
Tested by running the nightly test action on CI.
- vLLM version: v0.13.0
- vLLM main: 45c1ca1ca1
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
This commit is contained in:
6
.github/workflows/nightly_test_a3.yaml
vendored
6
.github/workflows/nightly_test_a3.yaml
vendored
@@ -77,6 +77,12 @@ jobs:
|
|||||||
- name: multi-node-qwenw8a8-2node-longseq
|
- name: multi-node-qwenw8a8-2node-longseq
|
||||||
config_file_path: Qwen3-235B-W8A8-longseq.yaml
|
config_file_path: Qwen3-235B-W8A8-longseq.yaml
|
||||||
size: 2
|
size: 2
|
||||||
|
- name: multi-node-qwen-disagg-pd
|
||||||
|
config_file_path: Qwen3-235B-disagg-pd.yaml
|
||||||
|
size: 2
|
||||||
|
- name: multi-node-qwen-vl-disagg-pd
|
||||||
|
config_file_path: Qwen3-VL-235B-disagg-pd.yaml
|
||||||
|
size: 2
|
||||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||||
with:
|
with:
|
||||||
soc_version: a3
|
soc_version: a3
|
||||||
|
|||||||
121
tests/e2e/nightly/multi_node/config/Qwen3-235B-disagg-pd.yaml
Normal file
121
tests/e2e/nightly/multi_node/config/Qwen3-235B-disagg-pd.yaml
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
test_name: "test Qwen3-235B-A22B disaggregated_prefill"
|
||||||
|
model: "Qwen/Qwen3-235B-A22B"
|
||||||
|
num_nodes: 2
|
||||||
|
npu_per_node: 16
|
||||||
|
env_common:
|
||||||
|
VLLM_USE_MODELSCOPE: true
|
||||||
|
PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
|
||||||
|
HCCL_BUFFSIZE: 1024
|
||||||
|
HCCL_OP_EXPANSION_MODE: "AIV"
|
||||||
|
OMP_PROC_BIND: false
|
||||||
|
OMP_NUM_THREADS: 1
|
||||||
|
VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
|
||||||
|
VLLM_ASCEND_ENABLE_FUSED_MC2: 2
|
||||||
|
TASK_QUEUE_ENABLE: 1
|
||||||
|
SERVER_PORT: 8080
|
||||||
|
|
||||||
|
disaggregated_prefill:
|
||||||
|
enabled: true
|
||||||
|
prefiller_host_index: [0]
|
||||||
|
decoder_host_index: [1]
|
||||||
|
|
||||||
|
deployment:
|
||||||
|
-
|
||||||
|
server_cmd: >
|
||||||
|
vllm serve "Qwen/Qwen3-235B-A22B"
|
||||||
|
--host 0.0.0.0
|
||||||
|
--port $SERVER_PORT
|
||||||
|
--data-parallel-size 2
|
||||||
|
--data-parallel-size-local 2
|
||||||
|
--data-parallel-start-rank 0
|
||||||
|
--data-parallel-address $LOCAL_IP
|
||||||
|
--data-parallel-rpc-port 13389
|
||||||
|
--tensor-parallel-size 8
|
||||||
|
--seed 1024
|
||||||
|
--max-num-seqs 32
|
||||||
|
--max-model-len 8192
|
||||||
|
--max-num-batched-tokens 8192
|
||||||
|
--enable-expert-parallel
|
||||||
|
--trust-remote-code
|
||||||
|
--gpu-memory-utilization 0.9
|
||||||
|
--no-enable-prefix-caching
|
||||||
|
--kv-transfer-config
|
||||||
|
'{"kv_connector": "MooncakeConnectorV1",
|
||||||
|
"kv_role": "kv_producer",
|
||||||
|
"kv_port": "30000",
|
||||||
|
"engine_id": "0",
|
||||||
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
|
"kv_connector_extra_config": {
|
||||||
|
"use_ascend_direct": true,
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 2,
|
||||||
|
"tp_size": 8
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 4,
|
||||||
|
"tp_size": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
|
||||||
|
-
|
||||||
|
server_cmd: >
|
||||||
|
vllm serve "Qwen/Qwen3-235B-A22B"
|
||||||
|
--host 0.0.0.0
|
||||||
|
--port $SERVER_PORT
|
||||||
|
--data-parallel-size 4
|
||||||
|
--data-parallel-size-local 4
|
||||||
|
--data-parallel-start-rank 0
|
||||||
|
--data-parallel-address $LOCAL_IP
|
||||||
|
--data-parallel-rpc-port 13389
|
||||||
|
--tensor-parallel-size 4
|
||||||
|
--seed 1024
|
||||||
|
--max-num-seqs 32
|
||||||
|
--max-model-len 8192
|
||||||
|
--max-num-batched-tokens 8192
|
||||||
|
--enable-expert-parallel
|
||||||
|
--trust-remote-code
|
||||||
|
--gpu-memory-utilization 0.9
|
||||||
|
--no-enable-prefix-caching
|
||||||
|
--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
|
||||||
|
--async-scheduling
|
||||||
|
--kv-transfer-config
|
||||||
|
'{"kv_connector": "MooncakeConnectorV1",
|
||||||
|
"kv_role": "kv_consumer",
|
||||||
|
"kv_port": "30100",
|
||||||
|
"engine_id": "1",
|
||||||
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
|
"kv_connector_extra_config": {
|
||||||
|
"use_ascend_direct": true,
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 2,
|
||||||
|
"tp_size": 8
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 4,
|
||||||
|
"tp_size": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
|
||||||
|
benchmarks:
|
||||||
|
perf:
|
||||||
|
case_type: performance
|
||||||
|
dataset_path: vllm-ascend/GSM8K-in3500-bs2800
|
||||||
|
request_conf: vllm_api_stream_chat
|
||||||
|
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
|
||||||
|
num_prompts: 2800
|
||||||
|
max_out_len: 1500
|
||||||
|
batch_size: 700
|
||||||
|
request_rate: 11.2
|
||||||
|
baseline: 1
|
||||||
|
threshold: 0.97
|
||||||
|
acc:
|
||||||
|
case_type: accuracy
|
||||||
|
dataset_path: vllm-ascend/gsm8k-lite
|
||||||
|
request_conf: vllm_api_general_chat
|
||||||
|
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
|
||||||
|
max_out_len: 7680
|
||||||
|
batch_size: 512
|
||||||
|
baseline: 97
|
||||||
|
threshold: 3
|
||||||
108
tests/e2e/nightly/multi_node/config/Qwen3-VL-235B-disagg-pd.yaml
Normal file
108
tests/e2e/nightly/multi_node/config/Qwen3-VL-235B-disagg-pd.yaml
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
test_name: "test Qwen3-VL-235B-A22B disaggregated_prefill"
|
||||||
|
model: "Qwen/Qwen3-VL-235B-A22B-Instruct"
|
||||||
|
num_nodes: 2
|
||||||
|
npu_per_node: 16
|
||||||
|
env_common:
|
||||||
|
VLLM_USE_MODELSCOPE: true
|
||||||
|
HCCL_BUFFSIZE: 1024
|
||||||
|
SERVER_PORT: 8080
|
||||||
|
OMP_PROC_BIND: false
|
||||||
|
OMP_NUM_THREADS: 1
|
||||||
|
HCCL_OP_EXPANSION_MODE: "AIV"
|
||||||
|
TASK_QUEUE_ENABLE: 1
|
||||||
|
PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
|
||||||
|
|
||||||
|
disaggregated_prefill:
|
||||||
|
enabled: true
|
||||||
|
prefiller_host_index: [0]
|
||||||
|
decoder_host_index: [1]
|
||||||
|
|
||||||
|
deployment:
|
||||||
|
-
|
||||||
|
server_cmd: >
|
||||||
|
vllm serve "Qwen/Qwen3-VL-235B-A22B-Instruct"
|
||||||
|
--host 0.0.0.0
|
||||||
|
--port $SERVER_PORT
|
||||||
|
--data-parallel-size 2
|
||||||
|
--data-parallel-size-local 2
|
||||||
|
--tensor-parallel-size 8
|
||||||
|
--seed 1024
|
||||||
|
--enable-expert-parallel
|
||||||
|
--max-num-seqs 32
|
||||||
|
--max-model-len 8192
|
||||||
|
--max-num-batched-tokens 8192
|
||||||
|
--trust-remote-code
|
||||||
|
--no-enable-prefix-caching
|
||||||
|
--gpu-memory-utilization 0.9
|
||||||
|
--kv-transfer-config
|
||||||
|
'{"kv_connector": "MooncakeConnectorV1",
|
||||||
|
"kv_role": "kv_producer",
|
||||||
|
"kv_port": "30000",
|
||||||
|
"engine_id": "0",
|
||||||
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
|
"kv_connector_extra_config": {
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 2,
|
||||||
|
"tp_size": 8
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 4,
|
||||||
|
"tp_size": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
-
|
||||||
|
server_cmd: >
|
||||||
|
vllm serve "Qwen/Qwen3-VL-235B-A22B-Instruct"
|
||||||
|
--host 0.0.0.0
|
||||||
|
--port $SERVER_PORT
|
||||||
|
--data-parallel-size 4
|
||||||
|
--data-parallel-size-local 4
|
||||||
|
--tensor-parallel-size 4
|
||||||
|
--seed 1024
|
||||||
|
--enable-expert-parallel
|
||||||
|
--max-num-seqs 32
|
||||||
|
--max-model-len 8192
|
||||||
|
--max-num-batched-tokens 8192
|
||||||
|
--trust-remote-code
|
||||||
|
--no-enable-prefix-caching
|
||||||
|
--gpu-memory-utilization 0.9
|
||||||
|
--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
|
||||||
|
--kv-transfer-config
|
||||||
|
'{"kv_connector": "MooncakeConnectorV1",
|
||||||
|
"kv_role": "kv_consumer",
|
||||||
|
"kv_port": "30200",
|
||||||
|
"engine_id": "1",
|
||||||
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
|
"kv_connector_extra_config": {
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 2,
|
||||||
|
"tp_size": 8
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 4,
|
||||||
|
"tp_size": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
benchmarks:
|
||||||
|
perf:
|
||||||
|
case_type: performance
|
||||||
|
dataset_path: vllm-ascend/textvqa-perf-1080p
|
||||||
|
request_conf: vllm_api_stream_chat
|
||||||
|
dataset_conf: textvqa/textvqa_gen_base64
|
||||||
|
num_prompts: 2800
|
||||||
|
max_out_len: 1500
|
||||||
|
batch_size: 64
|
||||||
|
request_rate: 11.2
|
||||||
|
baseline: 1
|
||||||
|
threshold: 0.97
|
||||||
|
acc:
|
||||||
|
case_type: accuracy
|
||||||
|
dataset_path: vllm-ascend/textvqa-lite
|
||||||
|
request_conf: vllm_api_stream_chat
|
||||||
|
dataset_conf: textvqa/textvqa_gen_base64
|
||||||
|
max_out_len: 7680
|
||||||
|
batch_size: 64
|
||||||
|
baseline: 85
|
||||||
|
threshold: 5
|
||||||
@@ -108,8 +108,12 @@ class AscendConfig:
|
|||||||
decode_tp_size = min(decode_tp_size, num_kv_head)
|
decode_tp_size = min(decode_tp_size, num_kv_head)
|
||||||
self.pd_head_ratio = prefill_tp_size // decode_tp_size
|
self.pd_head_ratio = prefill_tp_size // decode_tp_size
|
||||||
except Exception:
|
except Exception:
|
||||||
raise AssertionError(
|
raise ValueError(
|
||||||
"Can not get num_key_value_heads from model_config")
|
"The text_config extracted from the model config does not have "
|
||||||
|
"`num_key_value_heads` attribute. This indicates a mismatch "
|
||||||
|
"between the model config and vLLM's expectations. Please "
|
||||||
|
"ensure that the model config is compatible with vLLM."
|
||||||
|
)
|
||||||
|
|
||||||
if self.pd_tp_ratio == 0:
|
if self.pd_tp_ratio == 0:
|
||||||
raise AssertionError(
|
raise AssertionError(
|
||||||
|
|||||||
Reference in New Issue
Block a user