[Feat][Graph] Support FULL_DECODE_ONLY mode for MLA models (#3125)
### What this PR does / why we need it?
Adds support for capturing the Multi-head Latent Attention (MLA) decode
operation into an ACL graph. This improves performance by compiling the
attention kernel for single-token decoding.
Key changes include:
- Implementing the graph capture logic for the MLA kernel, including
workspace management and parameter updates.
- Modifying the rotary embedding (RoPE) handling to use pre-allocated
tensors, which is a requirement for graph capture.
- Adding a `build_for_graph_capture` method to the MLA metadata builder
to create dummy metadata during the graph compilation phase.
Known issues:
- MTP is not yet supported in FULL_DECODE_ONLY mode; we are working on a
fix.
- We plan to remove `update_mla_attn_params` by switching to
`auto_dispatch_capture`.
### Does this PR introduce _any_ user-facing change?
Yes. MLA models can now opt into full-graph decode by setting:
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
},
### How was this patch tested?
- vLLM version: v0.11.0
---------
Signed-off-by: panchao-hub <315134829@qq.com>
Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
This commit is contained in:
@@ -104,7 +104,8 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
|
||||
from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
|
||||
set_graph_params,
|
||||
update_attn_params)
|
||||
update_attn_params,
|
||||
update_mla_attn_params)
|
||||
from vllm_ascend.eplb.adaptor.vllm_adaptor import VllmEplbAdaptor
|
||||
from vllm_ascend.eplb.core.eplb_device_transfer_loader import \
|
||||
D2DExpertWeightLoader
|
||||
@@ -358,6 +359,25 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
dtype=torch.int32,
|
||||
device=self.device)
|
||||
|
||||
if self.vllm_config.model_config.use_mla and \
|
||||
self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY:
|
||||
rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
|
||||
self.cos = torch.ones(self.max_num_reqs,
|
||||
1,
|
||||
1,
|
||||
rope_dim,
|
||||
dtype=self.dtype,
|
||||
device=self.device)
|
||||
self.sin = torch.zeros(self.max_num_reqs,
|
||||
1,
|
||||
1,
|
||||
rope_dim,
|
||||
dtype=self.dtype,
|
||||
device=self.device)
|
||||
else:
|
||||
self.cos = None
|
||||
self.sin = None
|
||||
|
||||
self.uses_mrope = self.model_config.uses_mrope
|
||||
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
|
||||
if self.uses_mrope:
|
||||
@@ -1427,6 +1447,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
max_query_len=max_num_scheduled_tokens,
|
||||
graph_pad_size=self.graph_pad_size,
|
||||
decode_token_per_req=self.decode_token_per_req,
|
||||
cos=self.cos,
|
||||
sin=self.sin,
|
||||
)
|
||||
|
||||
if self.speculative_config and \
|
||||
@@ -1453,7 +1475,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
attn_metadata_i = builder.build(
|
||||
common_prefix_len=common_prefix_len,
|
||||
common_attn_metadata=common_attn_metadata,
|
||||
model=self.model,
|
||||
model=self.get_model(),
|
||||
**extra_attn_metadata_args)
|
||||
|
||||
if self.vllm_config.model_config.use_mla or self.ascend_config.use_sfa:
|
||||
@@ -1488,8 +1510,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
forward_context = get_forward_context()
|
||||
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL:
|
||||
update_attn_params(self.update_stream, forward_context,
|
||||
positions.shape[0])
|
||||
if self.vllm_config.model_config.use_mla:
|
||||
# FIXME: Try using `auto_dispatch_capture=True`
|
||||
update_mla_attn_params(self.update_stream, forward_context,
|
||||
positions.shape[0])
|
||||
else:
|
||||
update_attn_params(self.update_stream, forward_context,
|
||||
positions.shape[0])
|
||||
|
||||
if get_forward_context().sp_enabled:
|
||||
hidden_states = tensor_model_parallel_all_gather(hidden_states, 0)
|
||||
@@ -2195,14 +2222,21 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
block_table_tensor=block_table_tensor[:num_reqs],
|
||||
slot_mapping=self.slot_mapping,
|
||||
num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
positions=self.positions,
|
||||
attn_mask=self.attn_mask,
|
||||
spec_attn_mask=self.spec_attn_mask,
|
||||
attn_state=self.attn_state,
|
||||
max_query_len=max_query_len,
|
||||
decode_token_per_req=self.decode_token_per_req,
|
||||
cos=self.cos,
|
||||
sin=self.sin,
|
||||
)
|
||||
|
||||
for attn_group in self.attn_groups[kv_cache_group_id]:
|
||||
builder = attn_group.get_metadata_builder()
|
||||
attn_metadata_i = builder.build_for_graph_capture(
|
||||
common_attn_metadata)
|
||||
common_attn_metadata, AscendAttentionState.DecodeOnly,
|
||||
self.get_model())
|
||||
for layer_name in kv_cache_group_spec.layer_names:
|
||||
attn_metadata[layer_name] = attn_metadata_i
|
||||
|
||||
@@ -2218,9 +2252,15 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
inputs_embeds=inputs_embeds)
|
||||
forward_context = get_forward_context()
|
||||
assert forward_context is not None
|
||||
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL:
|
||||
update_attn_params(self.update_stream, forward_context,
|
||||
positions.shape[0])
|
||||
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
|
||||
not forward_context.capturing:
|
||||
if self.vllm_config.model_config.use_mla:
|
||||
# FIXME: Try using `auto_dispatch_capture=True`
|
||||
update_mla_attn_params(self.update_stream, forward_context,
|
||||
positions.shape[0])
|
||||
else:
|
||||
update_attn_params(self.update_stream, forward_context,
|
||||
positions.shape[0])
|
||||
|
||||
if self.drafter and self.drafter.name == SpecDcodeType.EAGLE3:
|
||||
hidden_states, _ = hidden_states
|
||||
|
||||
Reference in New Issue
Block a user