Default enable MLAPO (#5952)

### What this PR does / why we need it?
1) Enable MLAPO by default for DeepSeek MLA Attention W8A8 models on the
PD-disaggregation D (decode) instance, for example: DeepSeekV3-W8A8,
DeepSeek-R1-W8A8.
2) Default enable MLAPO for DeepSeek SFA Attention W8A8 models,
currently is DeepSeek-V3.2-W8A8.

### Does this PR introduce _any_ user-facing change?
Users no longer need to manually set VLLM_ASCEND_ENABLE_MLAPO=1 to enable
the MLAPO feature for DeepSeek W8A8 models.

The effect of enabling MLAPO for the SFA model deployed on a single A3 node:
Tested with: tests/e2e/nightly/single_node/models/test_deepseek_v3_2_exp_w8a8.py
dataset: gsm8k-lite, without MTP, FULL GRAPH — throughput improves by about 19%:
Without MLAPO enabled by default:

| Metric                  | Value            |
|-------------------------|------------------|
| TTFT                    | 14055.8836 ms    |
| ITL                     | 66.8171 ms       |
| Output Token Throughput | 104.9105 token/s |

With MLAPO enabled by default:

| Metric                  | Value            |
|-------------------------|------------------|
| TTFT                    | 3753.1547 ms     |
| ITL                     | 61.4236 ms       |
| Output Token Throughput | 125.2075 token/s |

- vLLM version: v0.13.0
- vLLM main:
2c24bc6996

---------

Signed-off-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
Nengjun Ma
2026-01-22 09:26:39 +08:00
committed by GitHub
parent a15a5f6aa5
commit ab676413e6
13 changed files with 17 additions and 29 deletions

View File

@@ -22,7 +22,8 @@ from vllm_ascend.attention.context_parallel.common_cp import (
from vllm_ascend.attention.utils import (
AscendCommonAttentionMetadata, ascend_chunked_prefill_workspace_size,
enable_cp, maybe_save_kv_layer_to_connector, split_decodes_and_prefills,
trans_rope_weight, transdata, wait_for_kv_layer_from_connector)
trans_rope_weight, transdata, wait_for_kv_layer_from_connector,
enabling_malpo)
from vllm_ascend.compilation.acl_graph import (
get_draft_graph_params, get_graph_params,
update_draft_graph_params_workspaces, update_graph_params_workspaces)
@@ -741,7 +742,7 @@ class AscendMLAImpl(MLAAttentionImpl):
self.ring_mla_mask_size = 512
self.speculative_config = self.vllm_config.speculative_config
self.enable_mlapo = envs.VLLM_ASCEND_ENABLE_MLAPO
self.enable_mlapo = enabling_malpo(self.vllm_config)
self.is_kv_producer = self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer
self.layer_sharding_kwargs = []
@@ -1491,7 +1492,6 @@ class AscendMLAImpl(MLAAttentionImpl):
# MLA Preprocess
if self.enable_mlapo and \
not has_prefill and \
attn_metadata.num_decode_tokens <= MLAPO_MAX_SUPPORTED_TOKENS:
hidden_states = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
hidden_states.contiguous(), need_gather_q_kv)

View File

@@ -375,6 +375,9 @@ class AscendSFAImpl(MLAAttentionImpl):
ascend_config = get_ascend_config()
self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
self.enable_prefetch = ascend_config.weight_prefetch_config.enabled
# In sfa, prefill and decode have the same calculation formula,
# so do not distinguish between prefill and decode here.
self.enable_mlapo = envs.VLLM_ASCEND_ENABLE_MLAPO
assert self.indexer is not None, "Indexer is required for DSA."

View File

@@ -9,6 +9,7 @@ from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm_ascend import envs
from vllm_ascend.utils import AscendDeviceType, get_ascend_config, get_ascend_device_type
@@ -302,3 +303,8 @@ def transdata(nd_mat, block_size: tuple = (16, 16)):
)
nz_mat = torch.reshape(nz_mat, (nz_mat.shape[0], nz_mat.shape[1] * nz_mat.shape[2], nz_mat.shape[3]))
return nz_mat
def enabling_malpo(vllm_config: VllmConfig) -> bool:
    """Return whether MLAPO should be enabled for this instance.

    MLAPO is enabled only when both conditions hold:
    1. The ``VLLM_ASCEND_ENABLE_MLAPO`` env flag is set (truthy), and
    2. this instance is a KV consumer, i.e. the decode (D) instance in a
       PD-disaggregation deployment.

    NOTE(review): the function name contains a typo ("malpo" instead of
    "mlapo"); it is kept as-is because callers reference it by this name.
    """
    kv_cfg = vllm_config.kv_transfer_config
    # Only the KV-consumer (decode) side of a PD-disaggregated deployment
    # qualifies; standalone instances have no kv_transfer_config at all.
    if kv_cfg is None or not kv_cfg.is_kv_consumer:
        return False
    return bool(envs.VLLM_ASCEND_ENABLE_MLAPO)

View File

@@ -92,7 +92,11 @@ env_variables: dict[str, Callable[[], Any]] = {
),
# Whether to enable msMonitor tool to monitor the performance of vllm-ascend.
"MSMONITOR_USE_DAEMON": lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", "0"))),
"VLLM_ASCEND_ENABLE_MLAPO": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLAPO", "0"))),
# Whether to enable MLAPO optimization for DeepSeek W8A8 series models.
# This option is enabled by default. MLAPO can improve performance, but
# it will consume more NPU memory. If reducing NPU memory usage is a higher priority
# for your DeepSeek W8A8 scene, then disable it.
"VLLM_ASCEND_ENABLE_MLAPO": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLAPO", "1"))),
# Whether to enable weight cast format to FRACTAL_NZ.
# 0: close nz;
# 1: only quant case enable nz;