[Feat][Graph] Support FULL_DECODE_ONLY mode for MLA models (#3125)

### What this PR does / why we need it? Adds support for capturing the Multi-head Latent Attention (MLA) decode operation into an ACL graph. This improves performance by compiling the attention kernel for single-token decoding. Key changes include: - Implementing the graph capture logic for the MLA kernel, including workspace management and parameter updates. - Modifying the rotary embedding (RoPE) handling to use pre-allocated tensors, which is a requirement for graph capture. - Adding a `build_for_graph_capture` method to the MLA metadata builder to create dummy metadata during the graph compilation phase. Known issues: - Currently, MTP is not supported in FULL_DECODE_ONLY mode -- we're working on a fix. - We are preparing to remove `update_mla_attn_params` in favor of `auto_dispatch_capture`. ### Does this PR introduce _any_ user-facing change? Yes: the new mode is enabled with compilation_config={ "cudagraph_mode": "FULL_DECODE_ONLY", }, ### How was this patch tested? - vLLM version: v0.11.0 --------- Signed-off-by: panchao-hub <315134829@qq.com> Signed-off-by: p00465316 <panchao13@huawei.com> Co-authored-by: p00465316 <panchao13@huawei.com> Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-10-10 16:31:20 +08:00
parent ba19dd3183
commit 1756efa5fd
8 changed files with 303 additions and 50 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -104,7 +104,8 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
                                               set_graph_params,
-                                               update_attn_params)
+                                               update_attn_params,
+                                               update_mla_attn_params)
 from vllm_ascend.eplb.adaptor.vllm_adaptor import VllmEplbAdaptor
 from vllm_ascend.eplb.core.eplb_device_transfer_loader import \
    D2DExpertWeightLoader
@@ -358,6 +359,25 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                        dtype=torch.int32,
                                        device=self.device)

+        if self.vllm_config.model_config.use_mla and \
+            self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY:
+            rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
+            self.cos = torch.ones(self.max_num_reqs,
+                                  1,
+                                  1,
+                                  rope_dim,
+                                  dtype=self.dtype,
+                                  device=self.device)
+            self.sin = torch.zeros(self.max_num_reqs,
+                                   1,
+                                   1,
+                                   rope_dim,
+                                   dtype=self.dtype,
+                                   device=self.device)
+        else:
+            self.cos = None
+            self.sin = None
+
        self.uses_mrope = self.model_config.uses_mrope
        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
        if self.uses_mrope:
@@ -1427,6 +1447,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                max_query_len=max_num_scheduled_tokens,
                graph_pad_size=self.graph_pad_size,
                decode_token_per_req=self.decode_token_per_req,
+                cos=self.cos,
+                sin=self.sin,
            )

            if self.speculative_config and \
@@ -1453,7 +1475,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                    attn_metadata_i = builder.build(
                        common_prefix_len=common_prefix_len,
                        common_attn_metadata=common_attn_metadata,
-                        model=self.model,
+                        model=self.get_model(),
                        **extra_attn_metadata_args)

                if self.vllm_config.model_config.use_mla or self.ascend_config.use_sfa:
@@ -1488,8 +1510,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):

        forward_context = get_forward_context()
        if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL:
-            update_attn_params(self.update_stream, forward_context,
-                               positions.shape[0])
+            if self.vllm_config.model_config.use_mla:
+                # FIXME: Try using `auto_dispatch_capture=True`
+                update_mla_attn_params(self.update_stream, forward_context,
+                                       positions.shape[0])
+            else:
+                update_attn_params(self.update_stream, forward_context,
+                                   positions.shape[0])

        if get_forward_context().sp_enabled:
            hidden_states = tensor_model_parallel_all_gather(hidden_states, 0)
@@ -2195,14 +2222,21 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                    block_table_tensor=block_table_tensor[:num_reqs],
                    slot_mapping=self.slot_mapping,
                    num_computed_tokens_cpu=num_computed_tokens_cpu,
+                    positions=self.positions,
+                    attn_mask=self.attn_mask,
+                    spec_attn_mask=self.spec_attn_mask,
+                    attn_state=self.attn_state,
                    max_query_len=max_query_len,
                    decode_token_per_req=self.decode_token_per_req,
+                    cos=self.cos,
+                    sin=self.sin,
                )

                for attn_group in self.attn_groups[kv_cache_group_id]:
                    builder = attn_group.get_metadata_builder()
                    attn_metadata_i = builder.build_for_graph_capture(
-                        common_attn_metadata)
+                        common_attn_metadata, AscendAttentionState.DecodeOnly,
+                        self.get_model())
                    for layer_name in kv_cache_group_spec.layer_names:
                        attn_metadata[layer_name] = attn_metadata_i

@@ -2218,9 +2252,15 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                   inputs_embeds=inputs_embeds)
        forward_context = get_forward_context()
        assert forward_context is not None
-        if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL:
-            update_attn_params(self.update_stream, forward_context,
-                               positions.shape[0])
+        if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
+            not forward_context.capturing:
+            if self.vllm_config.model_config.use_mla:
+                # FIXME: Try using `auto_dispatch_capture=True`
+                update_mla_attn_params(self.update_stream, forward_context,
+                                       positions.shape[0])
+            else:
+                update_attn_params(self.update_stream, forward_context,
+                                   positions.shape[0])

        if self.drafter and self.drafter.name == SpecDcodeType.EAGLE3:
            hidden_states, _ = hidden_states