Default enable MLAPO (#5952)
### What this PR does / why we need it?
1) Default enable MLAPO for DeepSeek MLA Attention W8A8 models on PD
disaggregation D Instance, for example: DeepSeekV3-W8A8,
DeepSeek-R1-W8A8.
2) Default enable MLAPO for DeepSeek SFA Attention W8A8 models,
currently is DeepSeek-V3.2-W8A8.
### Does this PR introduce _any_ user-facing change?
Users no longer need to manually set VLLM_ASCEND_ENABLE_MLAPO=1 to
enable the MLAPO feature for DeepSeek W8A8 models.
The effect of enabling MLAPO SFA model deployed on a single A3 Node:
Tested
with: tests/e2e/nightly/single_node/models/test_deepseek_v3_2_exp_w8a8.py
dataset: gsm8k-lite, without MTP set, FULL GRAPH; shows about a 19% improvement:
Without MLAPO enabled by default:
├─────────────────────────┤
│ TTFT │ 14055.8836 ms │
├─────────────────────────┤
│ ITL │ 66.8171 ms │
├─────────────────────────┤
│ Output Token Throughput │ 104.9105 token/s │
├─────────────────────────┤
With MLAPO enabled by default:
├─────────────────────────┤
│ TTFT │ 3753.1547 ms │
├─────────────────────────┤
│ ITL │ 61.4236 ms │
├─────────────────────────┤
│ Output Token Throughput │ 125.2075 token/s │
├─────────────────────────┤
- vLLM version: v0.13.0
- vLLM main:
2c24bc6996
---------
Signed-off-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
@@ -3,7 +3,6 @@ model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
|
||||
num_nodes: 2
|
||||
npu_per_node: 8
|
||||
env_common:
|
||||
VLLM_ASCEND_ENABLE_MLAPO: 1
|
||||
VLLM_ASCEND_BALANCE_SCHEDULING: 1
|
||||
HCCL_INTRA_PCIE_ENABLE: 1
|
||||
HCCL_INTRA_ROCE_ENABLE: 0
|
||||
|
||||
@@ -10,7 +10,6 @@ env_common:
|
||||
SERVER_PORT: 8080
|
||||
OMP_PROC_BIND: false
|
||||
OMP_NUM_THREADS: 1
|
||||
VLLM_ASCEND_ENABLE_MLAPO: 1
|
||||
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
|
||||
VLLM_ASCEND_ENABLE_FLASHCOMM1: 0
|
||||
ASCEND_A3_EBA_ENABLE: 1
|
||||
|
||||
@@ -31,7 +31,6 @@ MODELS = [
|
||||
MODES = [
|
||||
"single",
|
||||
"aclgraph",
|
||||
"aclgraph_mlapo",
|
||||
]
|
||||
|
||||
prompts = [
|
||||
@@ -88,8 +87,6 @@ async def test_models(model: str, mode: str) -> None:
|
||||
]
|
||||
if mode == "single":
|
||||
server_args.append("--enforce-eager")
|
||||
if mode == "aclgraph_mlapo":
|
||||
env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
|
||||
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
|
||||
@@ -56,7 +56,6 @@ async def test_models(model: str) -> None:
|
||||
"OMP_NUM_THREADS": "100",
|
||||
"OMP_PROC_BIND": "false",
|
||||
"HCCL_BUFFSIZE": "200",
|
||||
"VLLM_ASCEND_ENABLE_MLAPO": "1",
|
||||
"VLLM_RPC_TIMEOUT": "3600000",
|
||||
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000",
|
||||
"DISABLE_L2_CACHE": "1",
|
||||
|
||||
Reference in New Issue
Block a user