【main】【bugfix】fix: restrict default MLAPO activation to Decode nodes only (#6451)
### What this PR does / why we need it?
The default logic for MLAPO (MLA Policy Optimization) is incorrect. By
design, MLAPO should be enabled by default only on Decode (D) nodes.
However, the previous check treated any KV consumer as a Decode
instance, so in hybrid (collocated prefill and decode) scenarios, where
a node is both a KV producer and a KV consumer, MLAPO was also
erroneously activated during the Prefill stage.
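This PR corrects the default behavior so that MLAPO is enabled only for
the Decode phase: a node now counts as a Decode instance only if it is a
KV consumer and not also a KV producer. This prevents unexpected policy
interference during Prefill and ensures optimal performance in hybrid
deployment environments. It also renames the misspelled helper
`enabling_malpo` to `enabling_mlapo`.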
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.14.1
- vLLM main: dc917cceb8
---------
Signed-off-by: fems14 <1804143737@qq.com>
This commit is contained in:
@@ -22,7 +22,7 @@ from vllm_ascend.attention.utils import (
|
|||||||
AscendCommonAttentionMetadata,
|
AscendCommonAttentionMetadata,
|
||||||
ascend_chunked_prefill_workspace_size,
|
ascend_chunked_prefill_workspace_size,
|
||||||
enable_cp,
|
enable_cp,
|
||||||
enabling_malpo,
|
enabling_mlapo,
|
||||||
maybe_save_kv_layer_to_connector,
|
maybe_save_kv_layer_to_connector,
|
||||||
split_decodes_and_prefills,
|
split_decodes_and_prefills,
|
||||||
trans_rope_weight,
|
trans_rope_weight,
|
||||||
@@ -710,7 +710,7 @@ class AscendMLAImpl(MLAAttentionImpl):
|
|||||||
self.ring_mla_mask_size = 512
|
self.ring_mla_mask_size = 512
|
||||||
|
|
||||||
self.speculative_config = self.vllm_config.speculative_config
|
self.speculative_config = self.vllm_config.speculative_config
|
||||||
self.enable_mlapo = enabling_malpo(self.vllm_config)
|
self.enable_mlapo = enabling_mlapo(self.vllm_config)
|
||||||
|
|
||||||
self.is_kv_producer = (
|
self.is_kv_producer = (
|
||||||
self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer
|
self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer
|
||||||
|
|||||||
@@ -305,6 +305,10 @@ def transdata(nd_mat, block_size: tuple = (16, 16)):
|
|||||||
return nz_mat
|
return nz_mat
|
||||||
|
|
||||||
|
|
||||||
def enabling_malpo(vllm_config: VllmConfig) -> bool:
|
def enabling_mlapo(vllm_config: VllmConfig) -> bool:
|
||||||
is_decode_instance = vllm_config.kv_transfer_config is not None and vllm_config.kv_transfer_config.is_kv_consumer
|
is_decode_instance = (
|
||||||
|
vllm_config.kv_transfer_config is not None
|
||||||
|
and vllm_config.kv_transfer_config.is_kv_consumer
|
||||||
|
and not vllm_config.kv_transfer_config.is_kv_producer
|
||||||
|
)
|
||||||
return bool(envs.VLLM_ASCEND_ENABLE_MLAPO and is_decode_instance)
|
return bool(envs.VLLM_ASCEND_ENABLE_MLAPO and is_decode_instance)
|
||||||
|
|||||||
Reference in New Issue
Block a user