[CI] Enable FLASHCOMM1 with layer_sharding and FULL_DECODE_ONLY in ds32 testing (#6115)
### What this PR does / why we need it?
This PR enables FLASHCOMM1 communication optimization with layer
sharding for DeepSeek-V3.2 W8A8 model testing to
validate PR #5702. The changes include:
1. Enable FLASHCOMM1: Set VLLM_ASCEND_ENABLE_FLASHCOMM1=1 to improve
performance for distributed inference
2. Add layer sharding: Configure layer_sharding: ["q_b_proj", "o_proj"]
3. Update baselines: Adjust performance baselines to reflect the
improvements from FLASHCOMM1 and layer sharding
### Does this PR introduce _any_ user-facing change?
No. This is a CI/test-only change that enables new communication
optimization features for testing purposes.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
d68209402d
Signed-off-by: guozr <guozr1997@hotmail.com>
Co-authored-by: guozr <guozr1997@hotmail.com>
This commit is contained in:
@@ -242,7 +242,7 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
|
|||||||
|
|
||||||
|
|
||||||
@patch.dict(os.environ, {"HCCL_OP_EXPANSION_MODE": "AIV"})
|
@patch.dict(os.environ, {"HCCL_OP_EXPANSION_MODE": "AIV"})
|
||||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "0"})
|
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
|
||||||
@patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
|
@patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
|
||||||
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
|
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
|
||||||
def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
|
def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
|
||||||
@@ -262,6 +262,9 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
|
|||||||
"num_speculative_tokens": 2,
|
"num_speculative_tokens": 2,
|
||||||
"method": "deepseek_mtp"
|
"method": "deepseek_mtp"
|
||||||
},
|
},
|
||||||
|
additional_config={
|
||||||
|
"layer_sharding":["q_b_proj", "o_proj"]
|
||||||
|
},
|
||||||
reasoning_parser="deepseek_v3",
|
reasoning_parser="deepseek_v3",
|
||||||
tokenizer_mode="deepseek_v32") as vllm_model:
|
tokenizer_mode="deepseek_v32") as vllm_model:
|
||||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ env_common:
|
|||||||
OMP_PROC_BIND: false
|
OMP_PROC_BIND: false
|
||||||
OMP_NUM_THREADS: 1
|
OMP_NUM_THREADS: 1
|
||||||
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
|
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
|
||||||
VLLM_ASCEND_ENABLE_FLASHCOMM1: 0
|
VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
|
||||||
ASCEND_A3_EBA_ENABLE: 1
|
ASCEND_A3_EBA_ENABLE: 1
|
||||||
|
|
||||||
|
|
||||||
@@ -37,6 +37,7 @@ deployment:
|
|||||||
--trust-remote-code
|
--trust-remote-code
|
||||||
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
|
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
|
||||||
--compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
|
--compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
|
||||||
|
--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
|
||||||
--tokenizer-mode deepseek_v32
|
--tokenizer-mode deepseek_v32
|
||||||
--reasoning-parser deepseek_v3
|
--reasoning-parser deepseek_v3
|
||||||
|
|
||||||
@@ -61,6 +62,7 @@ deployment:
|
|||||||
--trust-remote-code
|
--trust-remote-code
|
||||||
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
|
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
|
||||||
--compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
|
--compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
|
||||||
|
--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
|
||||||
--tokenizer-mode deepseek_v32
|
--tokenizer-mode deepseek_v32
|
||||||
--reasoning-parser deepseek_v3
|
--reasoning-parser deepseek_v3
|
||||||
benchmarks:
|
benchmarks:
|
||||||
@@ -73,8 +75,9 @@ benchmarks:
|
|||||||
max_out_len: 3000
|
max_out_len: 3000
|
||||||
batch_size: 512
|
batch_size: 512
|
||||||
request_rate: 11.2
|
request_rate: 11.2
|
||||||
baseline: 905.6805
|
baseline: 1253.8466
|
||||||
threshold: 0.97
|
threshold: 0.97
|
||||||
|
|
||||||
acc:
|
acc:
|
||||||
case_type: accuracy
|
case_type: accuracy
|
||||||
dataset_path: vllm-ascend/gsm8k-lite
|
dataset_path: vllm-ascend/gsm8k-lite
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ aisbench_cases = [{
|
|||||||
"max_out_len": 1500,
|
"max_out_len": 1500,
|
||||||
"batch_size": 4,
|
"batch_size": 4,
|
||||||
"request_rate": 11.2,
|
"request_rate": 11.2,
|
||||||
"baseline": 120,
|
"baseline": 134,
|
||||||
"threshold": 0.97
|
"threshold": 0.97
|
||||||
}]
|
}]
|
||||||
|
|
||||||
@@ -72,7 +72,7 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
|
|||||||
"HCCL_BUFFSIZE": "1024",
|
"HCCL_BUFFSIZE": "1024",
|
||||||
"VLLM_ASCEND_ENABLE_MLAPO": "1",
|
"VLLM_ASCEND_ENABLE_MLAPO": "1",
|
||||||
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
||||||
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "0",
|
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1",
|
||||||
}
|
}
|
||||||
|
|
||||||
server_args = [
|
server_args = [
|
||||||
@@ -85,6 +85,8 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
|
|||||||
'{"cudagraph_capture_sizes":[3, 6, 9, 12], "cudagraph_mode":"FULL_DECODE_ONLY"}',
|
'{"cudagraph_capture_sizes":[3, 6, 9, 12], "cudagraph_mode":"FULL_DECODE_ONLY"}',
|
||||||
"--speculative-config",
|
"--speculative-config",
|
||||||
'{"num_speculative_tokens": 2, "method":"deepseek_mtp"}',
|
'{"num_speculative_tokens": 2, "method":"deepseek_mtp"}',
|
||||||
|
"--additional-config",
|
||||||
|
'{"layer_sharding": ["q_b_proj", "o_proj"]}',
|
||||||
"--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32"
|
"--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32"
|
||||||
]
|
]
|
||||||
request_keyword_args: dict[str, Any] = {
|
request_keyword_args: dict[str, Any] = {
|
||||||
|
|||||||
Reference in New Issue
Block a user