[CI] Enable FLASHCOMM1 with layer_sharding and FULL_DECODE_ONLY in ds32 testing (#6115)

### What this PR does / why we need it? This PR enables the FLASHCOMM1 communication optimization together with layer sharding for DeepSeek-V3.2 W8A8 model testing, in order to validate PR #5702. The changes include: 1. Enable FLASHCOMM1: Set VLLM_ASCEND_ENABLE_FLASHCOMM1=1 to improve performance for distributed inference. 2. Add layer sharding: Configure layer_sharding: ["q_b_proj", "o_proj"]. 3. Update baselines: Adjust the performance baselines to reflect the improvements from FLASHCOMM1 and layer sharding. ### Does this PR introduce _any_ user-facing change? No. This is a CI/test-only change that enables new communication optimization features for testing purposes. ### How was this patch tested? - vLLM version: v0.13.0 - vLLM main: d68209402d Signed-off-by: guozr <guozr1997@hotmail.com> Co-authored-by: guozr <guozr1997@hotmail.com>
2026-01-23 19:48:37 +08:00
parent 8786412f5c
commit 6c73b88dd6
3 changed files with 13 additions and 5 deletions
--- a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml
+++ b/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml
@@ -11,7 +11,7 @@ env_common:
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 1
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
-  VLLM_ASCEND_ENABLE_FLASHCOMM1: 0
+  VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
  ASCEND_A3_EBA_ENABLE: 1


@@ -37,6 +37,7 @@ deployment:
      --trust-remote-code
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}' 
+      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3

@@ -61,6 +62,7 @@ deployment:
      --trust-remote-code
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}' 
+      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
 benchmarks:
@@ -73,8 +75,9 @@ benchmarks:
    max_out_len: 3000
    batch_size: 512
    request_rate: 11.2
-    baseline: 905.6805
+    baseline: 1253.8466 
    threshold: 0.97
+    
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite