[CI] Enable FLASHCOMM1 with layer_sharding and FULL_DECODE_ONLY in ds32 testing (#6115)

### What this PR does / why we need it?

This PR enables FLASHCOMM1 communication optimization with layer
sharding for DeepSeek-V3.2 W8A8 model testing to
  validate PR #5702. The changes include:

  1. Enable FLASHCOMM1: Set VLLM_ASCEND_ENABLE_FLASHCOMM1=1 to
  improve performance for distributed inference
2. Add layer sharding: Configure layer_sharding: ["q_b_proj", "o_proj"]
3. Update baselines: Adjust performance baselines to reflect the
improvements from FLASHCOMM1 and layer sharding

### Does this PR introduce _any_ user-facing change?

No. This is a CI/test-only change that enables new communication
optimization features for testing purposes.

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
d68209402d

Signed-off-by: guozr <guozr1997@hotmail.com>
Co-authored-by: guozr <guozr1997@hotmail.com>
This commit is contained in:
starmountain1997
2026-01-23 19:48:37 +08:00
committed by GitHub
parent 8786412f5c
commit 6c73b88dd6
3 changed files with 13 additions and 5 deletions

View File

@@ -54,7 +54,7 @@ aisbench_cases = [{
"max_out_len": 1500,
"batch_size": 4,
"request_rate": 11.2,
"baseline": 120,
"baseline": 134,
"threshold": 0.97
}]
@@ -72,7 +72,7 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
"HCCL_BUFFSIZE": "1024",
"VLLM_ASCEND_ENABLE_MLAPO": "1",
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "0",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1",
}
server_args = [
@@ -85,6 +85,8 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
'{"cudagraph_capture_sizes":[3, 6, 9, 12], "cudagraph_mode":"FULL_DECODE_ONLY"}',
"--speculative-config",
'{"num_speculative_tokens": 2, "method":"deepseek_mtp"}',
"--additional-config",
'{"layer_sharding": ["q_b_proj", "o_proj"]}',
"--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32"
]
request_keyword_args: dict[str, Any] = {