[CI] Enable FLASHCOMM1 with layer_sharding and FULL_DECODE_ONLY in ds32 testing (#6115)
### What this PR does / why we need it?
This PR enables FLASHCOMM1 communication optimization with layer
sharding for DeepSeek-V3.2 W8A8 model testing to
validate PR #5702. The changes include:
1. Enable FLASHCOMM1: Set VLLM_ASCEND_ENABLE_FLASHCOMM1=1 to
improve performance for distributed inference
2. Add layer sharding: Configure layer_sharding: ["q_b_proj", "o_proj"]
3. Update baselines: Adjust performance baselines to reflect the
improvements from FLASHCOMM1 and layer sharding
### Does this PR introduce _any_ user-facing change?
No. This is a CI/test-only change that enables new communication
optimization features for testing purposes.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
d68209402d
Signed-off-by: guozr <guozr1997@hotmail.com>
Co-authored-by: guozr <guozr1997@hotmail.com>
This commit is contained in:
@@ -54,7 +54,7 @@ aisbench_cases = [{
 "max_out_len": 1500,
 "batch_size": 4,
 "request_rate": 11.2,
-"baseline": 120,
+"baseline": 134,
 "threshold": 0.97
 }]

@@ -72,7 +72,7 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
 "HCCL_BUFFSIZE": "1024",
 "VLLM_ASCEND_ENABLE_MLAPO": "1",
 "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
-"VLLM_ASCEND_ENABLE_FLASHCOMM1": "0",
+"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1",
 }

 server_args = [
@@ -85,6 +85,8 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
 '{"cudagraph_capture_sizes":[3, 6, 9, 12], "cudagraph_mode":"FULL_DECODE_ONLY"}',
 "--speculative-config",
 '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}',
+"--additional-config",
+'{"layer_sharding": ["q_b_proj", "o_proj"]}',
 "--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32"
 ]
 request_keyword_args: dict[str, Any] = {
Reference in New Issue
Block a user