From 6c73b88dd696d2e0417f50458301b7a605efaf9f Mon Sep 17 00:00:00 2001
From: starmountain1997 <77533802+starmountain1997@users.noreply.github.com>
Date: Fri, 23 Jan 2026 19:48:37 +0800
Subject: [PATCH] [CI] Enable FLASHCOMM1 with layer_sharding and
 FULL_DECODE_ONLY in ds32 testing (#6115)

### What this PR does / why we need it?
This PR enables the FLASHCOMM1 communication optimization together with layer sharding for DeepSeek-V3.2 W8A8 model testing, to validate PR #5702. The changes include:

1. Enable FLASHCOMM1: set `VLLM_ASCEND_ENABLE_FLASHCOMM1=1`, which improves performance for distributed inference.
2. Add layer sharding: configure `layer_sharding: ["q_b_proj", "o_proj"]`.
3. Update baselines: adjust the performance baselines to reflect the improvements from FLASHCOMM1 and layer sharding.
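For reference, a minimal sketch of how these settings combine at serve time. The env var and flag values are taken verbatim from the diffs below; `<model-path>` is a placeholder, and the real CI configs additionally pin capture sizes, parallelism, and other flags:

```bash
# Sketch only: the env var plus server flags this PR enables for ds32 testing.
# <model-path> is a placeholder; CI pins its own model and parallelism flags.
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1   # FLASHCOMM1 communication optimization
vllm serve <model-path> \
  --trust-remote-code \
  --speculative-config '{"num_speculative_tokens": 2, "method": "deepseek_mtp"}' \
  --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
  --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' \
  --tokenizer-mode deepseek_v32 \
  --reasoning-parser deepseek_v3
```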
### Does this PR introduce _any_ user-facing change?
No. This is a CI/test-only change that enables new communication optimization features for testing purposes.

### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60

Signed-off-by: guozr
Co-authored-by: guozr
---
 .../2-cards/test_offline_inference_distributed.py         | 5 ++++-
 .../config/DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml          | 7 +++++--
 .../nightly/single_node/models/test_deepseek_v3_2_w8a8.py | 6 ++++--
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
index 1b617e47..e9032e99 100644
--- a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
@@ -242,7 +242,7 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
 @patch.dict(os.environ, {"HCCL_OP_EXPANSION_MODE": "AIV"})
-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "0"})
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
 @patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
 def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
@@ -262,6 +262,9 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
                 "num_speculative_tokens": 2,
                 "method": "deepseek_mtp"
             },
+            additional_config={
+                "layer_sharding": ["q_b_proj", "o_proj"]
+            },
             reasoning_parser="deepseek_v3",
             tokenizer_mode="deepseek_v32") as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml
index 0bb313be..d7106dfd 100644
--- a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml
+++ b/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml
@@ -11,7 +11,7 @@ env_common:
   OMP_PROC_BIND: false
   OMP_NUM_THREADS: 1
   PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
-  VLLM_ASCEND_ENABLE_FLASHCOMM1: 0
+  VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
   ASCEND_A3_EBA_ENABLE: 1
@@ -37,6 +37,7 @@ deployment:
       --trust-remote-code
       --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
       --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
+      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
       --tokenizer-mode deepseek_v32
       --reasoning-parser deepseek_v3
@@ -61,6 +62,7 @@ deployment:
       --trust-remote-code
       --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
       --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
+      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
       --tokenizer-mode deepseek_v32
       --reasoning-parser deepseek_v3
 benchmarks:
@@ -73,8 +75,9 @@ benchmarks:
     max_out_len: 3000
     batch_size: 512
     request_rate: 11.2
-    baseline: 905.6805
+    baseline: 1253.8466
     threshold: 0.97
+
   acc:
     case_type: accuracy
     dataset_path: vllm-ascend/gsm8k-lite
diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py b/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py
index 8f0b2f64..7559e2da 100644
--- a/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py
+++ b/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py
@@ -54,7 +54,7 @@ aisbench_cases = [{
     "max_out_len": 1500,
     "batch_size": 4,
     "request_rate": 11.2,
-    "baseline": 120,
+    "baseline": 134,
     "threshold": 0.97
 }]
@@ -72,7 +72,7 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
         "HCCL_BUFFSIZE": "1024",
         "VLLM_ASCEND_ENABLE_MLAPO": "1",
         "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
-        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "0",
+        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1",
     }
     server_args = [
@@ -85,6 +85,8 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
         '{"cudagraph_capture_sizes":[3, 6, 9, 12], "cudagraph_mode":"FULL_DECODE_ONLY"}',
         "--speculative-config",
         '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}',
+        "--additional-config",
+        '{"layer_sharding": ["q_b_proj", "o_proj"]}',
         "--reasoning-parser", "deepseek_v3",
         "--tokenizer_mode", "deepseek_v32"
     ]
     request_keyword_args: dict[str, Any] = {