From 775fbc4cd21718b53a033856822ff9f53b6b28cc Mon Sep 17 00:00:00 2001 From: fems14 <74094523+fems14@users.noreply.github.com> Date: Sat, 31 Jan 2026 22:44:56 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90main=E3=80=91=E3=80=90bugfix=E3=80=91f?= =?UTF-8?q?ix:=20restrict=20default=20MLAPO=20activation=20to=20Decode=20n?= =?UTF-8?q?odes=20only=20(#6451)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? There is an issue with the current default logic for MLAPO (MLA Policy Optimization). By design, MLAPO should only be enabled by default on Decode (D) nodes. However, in hybrid (collocated prefill and decode) scenarios, the strategy is erroneously activated during the Prefill stage. This PR corrects the default behavior so that MLAPO is enabled by default only on pure decode instances — instances whose KV-transfer config marks them as KV consumers and not also KV producers — which excludes collocated (hybrid) deployments. It also renames the misspelled helper `enabling_malpo` to `enabling_mlapo`. This prevents unexpected policy interference during Prefill and ensures optimal performance in hybrid deployment environments. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? 
- vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd --------- Signed-off-by: fems14 <1804143737@qq.com> --- vllm_ascend/attention/mla_v1.py | 4 ++-- vllm_ascend/attention/utils.py | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 5414ee58..03834212 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -22,7 +22,7 @@ from vllm_ascend.attention.utils import ( AscendCommonAttentionMetadata, ascend_chunked_prefill_workspace_size, enable_cp, - enabling_malpo, + enabling_mlapo, maybe_save_kv_layer_to_connector, split_decodes_and_prefills, trans_rope_weight, @@ -710,7 +710,7 @@ class AscendMLAImpl(MLAAttentionImpl): self.ring_mla_mask_size = 512 self.speculative_config = self.vllm_config.speculative_config - self.enable_mlapo = enabling_malpo(self.vllm_config) + self.enable_mlapo = enabling_mlapo(self.vllm_config) self.is_kv_producer = ( self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer diff --git a/vllm_ascend/attention/utils.py b/vllm_ascend/attention/utils.py index 8414fc5d..8eb91ec5 100644 --- a/vllm_ascend/attention/utils.py +++ b/vllm_ascend/attention/utils.py @@ -305,6 +305,10 @@ def transdata(nd_mat, block_size: tuple = (16, 16)): return nz_mat -def enabling_malpo(vllm_config: VllmConfig) -> bool: - is_decode_instance = vllm_config.kv_transfer_config is not None and vllm_config.kv_transfer_config.is_kv_consumer +def enabling_mlapo(vllm_config: VllmConfig) -> bool: + is_decode_instance = ( + vllm_config.kv_transfer_config is not None + and vllm_config.kv_transfer_config.is_kv_consumer + and not vllm_config.kv_transfer_config.is_kv_producer + ) return bool(envs.VLLM_ASCEND_ENABLE_MLAPO and is_decode_instance)