Default enable MLAPO (#5952)
### What this PR does / why we need it?
1) Enable MLAPO by default for DeepSeek MLA-attention W8A8 models on the
D (decode) instance of PD disaggregation, for example DeepSeekV3-W8A8 and
DeepSeek-R1-W8A8.
2) Enable MLAPO by default for DeepSeek SFA-attention W8A8 models,
currently DeepSeek-V3.2-W8A8. A sketch of the gating rule follows below.
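
The change routes the default through a model-aware helper (`enabling_malpo` in the diff further down). Below is a minimal sketch of what such a gating rule could look like, assuming the `VLLM_ASCEND_ENABLE_MLAPO` env var still acts as an explicit override and that model and quantization details can be read off the vLLM config; every predicate and attribute choice here is an illustrative assumption, not the actual implementation:

```python
# Hypothetical sketch only: the real gate is ``enabling_malpo`` in
# vllm_ascend/attention/utils.py; the predicates below are invented
# for illustration and are NOT the actual implementation.
import os


def _is_deepseek_w8a8(vllm_config) -> bool:
    # Assumption: match on the HF architecture name plus any quantization
    # setting; the real check is likely stricter (W8A8 specifically).
    hf_config = vllm_config.model_config.hf_config
    archs = getattr(hf_config, "architectures", []) or []
    quant = vllm_config.model_config.quantization
    return quant is not None and any("Deepseek" in a for a in archs)


def _uses_sfa(vllm_config) -> bool:
    # Assumption: DeepSeek-V3.2's sparse (SFA) attention is detectable
    # via the ``index_topk`` field on its HF config.
    return hasattr(vllm_config.model_config.hf_config, "index_topk")


def enable_mlapo_by_default(vllm_config) -> bool:
    # Assumption: an explicit env setting still overrides the default.
    env = os.environ.get("VLLM_ASCEND_ENABLE_MLAPO")
    if env is not None:
        return env == "1"
    if not _is_deepseek_w8a8(vllm_config):
        return False
    # SFA W8A8 models (DeepSeek-V3.2-W8A8): default on everywhere.
    if _uses_sfa(vllm_config):
        return True
    # MLA W8A8 models: default on only on the PD-disaggregation
    # D (decode, i.e. KV-consumer) instance.
    kv_cfg = vllm_config.kv_transfer_config
    return kv_cfg is not None and kv_cfg.is_kv_consumer
```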
### Does this PR introduce _any_ user-facing change?
Users no longer need to set `VLLM_ASCEND_ENABLE_MLAPO=1` manually to enable
the MLAPO feature for DeepSeek W8A8 models.
Effect of enabling MLAPO for the SFA model deployed on a single A3 node,
tested with `tests/e2e/nightly/single_node/models/test_deepseek_v3_2_exp_w8a8.py`
on the gsm8k-lite dataset (MTP not set, FULL GRAPH mode). Output token
throughput improves by about 19%:
| Metric                  | MLAPO disabled (old default) | MLAPO enabled (new default) |
|-------------------------|------------------------------|-----------------------------|
| TTFT                    | 14055.8836 ms                | 3753.1547 ms                |
| ITL                     | 66.8171 ms                   | 61.4236 ms                  |
| Output Token Throughput | 104.9105 token/s             | 125.2075 token/s            |
- vLLM version: v0.13.0
- vLLM main: 2c24bc6996
---------
Signed-off-by: leo-pony <nengjunma@outlook.com>
@@ -22,7 +22,8 @@ from vllm_ascend.attention.context_parallel.common_cp import (
 from vllm_ascend.attention.utils import (
     AscendCommonAttentionMetadata, ascend_chunked_prefill_workspace_size,
     enable_cp, maybe_save_kv_layer_to_connector, split_decodes_and_prefills,
-    trans_rope_weight, transdata, wait_for_kv_layer_from_connector)
+    trans_rope_weight, transdata, wait_for_kv_layer_from_connector,
+    enabling_malpo)
 from vllm_ascend.compilation.acl_graph import (
     get_draft_graph_params, get_graph_params,
     update_draft_graph_params_workspaces, update_graph_params_workspaces)
@@ -741,7 +742,7 @@ class AscendMLAImpl(MLAAttentionImpl):
         self.ring_mla_mask_size = 512

         self.speculative_config = self.vllm_config.speculative_config
-        self.enable_mlapo = envs.VLLM_ASCEND_ENABLE_MLAPO
+        self.enable_mlapo = enabling_malpo(self.vllm_config)

         self.is_kv_producer = self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer
         self.layer_sharding_kwargs = []
@@ -1491,7 +1492,6 @@ class AscendMLAImpl(MLAAttentionImpl):

         # MLA Preprocess
         if self.enable_mlapo and \
                 not has_prefill and \
                 attn_metadata.num_decode_tokens <= MLAPO_MAX_SUPPORTED_TOKENS:
             hidden_states = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
                 hidden_states.contiguous(), need_gather_q_kv)
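
The last hunk above shows the runtime guard: the fused MLA-preprocess (MLAPO) path is only taken for pure-decode steps whose token count fits the kernel's limit. A minimal standalone sketch of that dispatch; `MLAPO_MAX_SUPPORTED_TOKENS` is a real constant in the file, but the default value of 512 used here is an assumption for illustration:

```python
# Sketch of the decode-path guard from the hunk above. The cap value of
# 512 is an illustrative assumption; the real limit is the module-level
# constant MLAPO_MAX_SUPPORTED_TOKENS.
def takes_mlapo_path(enable_mlapo: bool,
                     has_prefill: bool,
                     num_decode_tokens: int,
                     max_supported_tokens: int = 512) -> bool:
    return (enable_mlapo
            and not has_prefill
            and num_decode_tokens <= max_supported_tokens)


# Example: a pure-decode step with 64 tokens takes the fused kernel,
# while any step containing prefill tokens falls back to the regular path.
assert takes_mlapo_path(True, has_prefill=False, num_decode_tokens=64)
assert not takes_mlapo_path(True, has_prefill=True, num_decode_tokens=64)
```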