[Feat] Support running MTP in full graph mode (#3892)

### What this PR does / why we need it?
Currently, the MTP model still runs in eager mode even when full graph mode is
enabled. This PR adapts MTP to full graph capture and execution. When the graph
mode is set to "FULL_DECODE_ONLY", MTP runs as a full graph to improve
performance.
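
For reference, a minimal sketch of how this mode could be enabled, assuming vLLM v0.11.0's dict-style `compilation_config` and `speculative_config` arguments; the model path, drafter method name, and token count below are placeholder assumptions, not part of this PR:

```python
# Illustrative sketch only: run an MTP drafter with full-graph decode capture.
# Model path, speculative method name, and num_speculative_tokens are
# placeholder assumptions; the graph mode string is the one this PR targets.
from vllm import LLM

llm = LLM(
    model="deepseek-ai/DeepSeek-V3",           # assumption: an MTP-capable model
    speculative_config={
        "method": "deepseek_mtp",              # assumption: MTP drafter method
        "num_speculative_tokens": 1,
    },
    compilation_config={
        "cudagraph_mode": "FULL_DECODE_ONLY",  # graph mode referenced by this PR
    },
)
```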

The changes, covering both the disable_padded_drafter_batch=True and
disable_padded_drafter_batch=False cases, include:

1. Add _mtp_graph_params in acl_graph.py to isolate the main model's graph
data from the MTP model's (see the sketch after this list).
2. Pad some metadata in mla_v1.py when running in full graph mode.
3. Fix the addresses of the essential data used in model.forward.
4. Adapt to the aclgraph capture framework:
    1). Rebuild the MTP model with ACLGraphWrapper.
    2). Add common attn metadata when starting capture in the MTP dummy_run.
    3). Add common attn metadata updates in MTP.
    4). Adapt data updates when num_speculative_tokens > 1.
5. Add an MTP patch to adapt to vLLM v0.11.0.
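
A minimal sketch of the isolation idea behind _mtp_graph_params (simplified; the real GraphParams in vllm_ascend/compilation/acl_graph.py carries more state, and the field names here are illustrative):

```python
# Sketch: keep the MTP drafter's captured graph state separate from the main
# model's, so that capture/replay of one never overwrites the other.
from dataclasses import dataclass, field
from typing import Any, Optional


@dataclass
class GraphParams:
    # Per-capture-size kernel handles, attention args, and workspaces.
    handles: dict[int, list[Any]] = field(default_factory=dict)
    attn_params: dict[int, list[Any]] = field(default_factory=dict)
    workspaces: dict[int, Any] = field(default_factory=dict)


_graph_params: Optional[GraphParams] = None      # main model
_mtp_graph_params: Optional[GraphParams] = None  # MTP drafter, isolated


def get_graph_params() -> Optional[GraphParams]:
    return _graph_params


def get_mtp_graph_params() -> Optional[GraphParams]:
    return _mtp_graph_params
```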

Existing issues:
1. When disable_padded_drafter_batch=True and running in full graph mode, the
data of the first-round requests in MTP is abnormal. We still need to identify
the root cause.
2. When disable_padded_drafter_batch=False and running in full graph mode, the
acceptance rate of the second and third tokens decreases (for example, with
num_speculative_tokens=3, the acceptance rate of the first token is 90%, but
the second drops to only 50% instead of 60%, and the third to only 20% instead
of 30%). The reason is that the data post-processed after the model run does
not match; the problem comes from another PR. It works fine in eager and
PIECEWISE modes but breaks in full graph mode. Once we have a solution, we will
submit a bugfix.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0
- vLLM main: 2918c1b49c

---------

Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
Author: anon189Ty
Date: 2025-11-20 20:34:54 +08:00
Committed by: GitHub
Parent: 15c1eb025c
Commit: 5c9f4a40c6
8 changed files with 536 additions and 42 deletions


@@ -8,6 +8,7 @@ import torch.distributed as dist
import torch_npu
from torch import nn
from vllm.attention.backends.abstract import AttentionBackend, MLAAttentionImpl
from vllm.attention.backends.utils import PAD_SLOT_ID
from vllm.config import VllmConfig, get_current_vllm_config
from vllm.distributed import (get_dcp_group,
get_decode_context_model_parallel_rank,
@@ -38,6 +39,7 @@ from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
trans_rope_weight, transdata,
wait_for_kv_layer_from_connector)
from vllm_ascend.compilation.acl_graph import (get_graph_params,
get_mtp_graph_params,
update_graph_params_workspaces)
from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
@@ -337,6 +339,74 @@ class AscendMLAMetadataBuilder:
# better way of doing this
return modified_batch
def pad_actual_seq_len_q_mtp_enable_pad(self, num_reqs_pad_size, num_reqs,
actual_seq_lengths_q,
common_attn_metadata):
"""
Pads actual_seq_lengths_q evenly so that no step exceeds 16 tokens per request,
in order to meet the requirements of npu_fused_infer_attention_score.
In the Torchair scenario, the query lengths must be padded to the same length,
and npu_fused_infer_attention_score requires the last element to equal batch_size (num_tokens).
For example:
batch_size=36, num_reqs_pad_size=2, num_reqs=16
By default, each request should infer 2 tokens, which means actual_seq_lengths_q should be
[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36].
However, in the MTP Torchair + PD scenario, actual_seq_lengths_q may be
[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] before padding, since the first decode request only has 1 token.
To meet this constraint, we pad actual_seq_lengths_q evenly so that no step exceeds 16 tokens per request;
after padding, actual_seq_lengths_q should be similar to [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,36]
"""
FIA_SEQ_LEN_LIMIT = 16
need_padding = num_reqs_pad_size != 0 and \
len(common_attn_metadata.actual_seq_lengths_q) > num_reqs and \
common_attn_metadata.actual_seq_lengths_q[num_reqs] - actual_seq_lengths_q[-1] > FIA_SEQ_LEN_LIMIT
if need_padding:
padding_seq_len_q = common_attn_metadata.actual_seq_lengths_q[
num_reqs:num_reqs + num_reqs_pad_size]
start_val = actual_seq_lengths_q[-1]
end_val = padding_seq_len_q[-1]
num_step = len(padding_seq_len_q)
interpolated = np.round(
np.linspace(start_val, end_val,
num_step + 1)[1:]).astype(int).tolist()
assert interpolated[-1] == end_val
assert len(interpolated) == len(padding_seq_len_q)
actual_seq_lengths_q = actual_seq_lengths_q + interpolated
else:
actual_seq_lengths_q = actual_seq_lengths_q + common_attn_metadata.actual_seq_lengths_q[
num_reqs:num_reqs + num_reqs_pad_size]
return actual_seq_lengths_q
def pad_actual_seq_len_q_mtp_disable_pad(self, num_reqs_pad_size, num_reqs,
actual_seq_lengths_q):
"""
Only used in ACL full graph mode.
Pads actual_seq_lengths_q so that its last element equals T in the TND layout
(the padded token count) and its length equals the main model's batch size.
For example:
batch_size = 8, num_reqs = 4, num_speculative_tokens = 1
input actual_seq_lengths_q = [1, 2, 4, 5] (the 3rd request accepted a token)
After padding, actual_seq_lengths_q will be similar to [1, 2, 4, 5, 6, 6, 7, 8]
"""
need_padding = num_reqs_pad_size > 0
if need_padding:
start_val = actual_seq_lengths_q[-1]
end_val = num_reqs + num_reqs_pad_size
num_step = num_reqs_pad_size
interpolated = np.round(
np.linspace(start_val, end_val,
num_step + 1)[1:]).astype(int).tolist()
assert interpolated[-1] == end_val
assert len(interpolated) == num_reqs_pad_size
actual_seq_lengths_q = actual_seq_lengths_q + interpolated
return actual_seq_lengths_q
def build(
self,
common_prefix_len: int,
@@ -362,17 +432,25 @@ class AscendMLAMetadataBuilder:
# it blocks on all previous kernels.
device = self.device
block_table = (common_attn_metadata.block_table_tensor[:num_reqs])
# If graph_pad_size > -1, it means we are running in full graph mode.
graph_pad_size = common_attn_metadata.graph_pad_size
# NOTE: Maybe this block_table change can be removed when graph_pad_size > 1.
if graph_pad_size > num_reqs and self.speculative_config.disable_padded_drafter_batch:
block_table = (
common_attn_metadata.block_table_tensor[:graph_pad_size])
else:
block_table = (common_attn_metadata.block_table_tensor[:num_reqs])
# NOTE: Currently, MTP full graph mode is incompatible with pcp
if self.pcp_size > 1:
num_decodes_flatten = num_decodes * self.decode_threshold
block_table = common_attn_metadata.block_table_tensor[:
num_decodes_flatten
+
num_prefills]
if num_actual_tokens_pcp_padded is None:
num_actual_tokens_pcp_padded = num_actual_tokens
# NOTE: Currently, MTP full graph mode is incompatible with pcp
slot_mapping = common_attn_metadata.slot_mapping[:
num_actual_tokens_pcp_padded]
input_positions = common_attn_metadata.positions[:
@@ -565,6 +643,11 @@ class AscendMLAMetadataBuilder:
block_table = block_table[:num_decodes_flatten, ...]
else:
block_table = block_table[:num_decodes, ...]
# NOTE: Currently, MTP full graph mode is incompatible with pcp
# NOTE: Maybe this block_table change can be removed when graph_pad_size > 1.
if graph_pad_size > num_decodes and \
self.speculative_config.disable_padded_drafter_batch:
block_table = block_table[:graph_pad_size, ...]
seq_lens_list = seq_lens.tolist()
if num_computed_tokens_of_pcp_dcp is not None:
@@ -586,6 +669,52 @@ class AscendMLAMetadataBuilder:
else:
cp_seq_len, batch_seq_mask = None, None
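# Pad the decode metadata (seq lens, block_table, slot_mapping, positions) up
# to graph_pad_size so that replayed full graphs always see fixed shapes.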
if graph_pad_size > num_reqs:
if self.speculative_config.disable_padded_drafter_batch:
num_reqs_pad_size = graph_pad_size - num_reqs
actual_seq_lengths_q = self.pad_actual_seq_len_q_mtp_disable_pad(
num_reqs_pad_size, num_reqs, actual_seq_lengths_q)
seq_lens_list = seq_lens_list + [0] * (graph_pad_size - \
num_decodes)
num_block_pad_size = graph_pad_size - block_table.shape[0]
if num_block_pad_size > 0:
block_table_padding = torch.zeros(
(num_block_pad_size, ) + block_table.shape[1:],
dtype=block_table.dtype,
device=block_table.device)
block_table = torch.cat(
[block_table, block_table_padding], dim=0)
else:
num_token_pad_size = graph_pad_size - num_decode_tokens
num_reqs_pad_size = (
graph_pad_size //
common_attn_metadata.decode_token_per_req - num_reqs)
num_block_table_pad_size = (
graph_pad_size //
common_attn_metadata.decode_token_per_req -
num_decodes)
seq_lens_list = seq_lens.tolist() + [0] * num_reqs_pad_size
slot_padding = torch.full((num_token_pad_size, ),
PAD_SLOT_ID,
dtype=slot_mapping.dtype,
device=slot_mapping.device)
slot_mapping = torch.cat([slot_mapping, slot_padding])
block_table_padding = torch.zeros(
(num_block_table_pad_size, ) + block_table.shape[1:],
dtype=block_table.dtype,
device=block_table.device)
block_table = torch.cat([block_table, block_table_padding],
dim=0)
position_padding = torch.zeros(
num_token_pad_size,
dtype=input_positions.dtype,
device=input_positions.device)
input_positions = torch.cat(
[input_positions, position_padding])
actual_seq_lengths_q = self.pad_actual_seq_len_q_mtp_enable_pad(
num_reqs_pad_size, num_reqs, actual_seq_lengths_q,
common_attn_metadata)
# TODO: After full graph mode supports MTP, this if branch needs to be deleted
assert self.cos_cache is not None
assert self.sin_cache is not None
@@ -1267,8 +1396,11 @@ class AscendMLAImpl(MLAAttentionImpl):
"actual_seq_lengths": actual_seq_lengths,
"actual_seq_lengths_kv": decode_meta.seq_lens_list,
}
graph_params = get_graph_params()
forward_context: ForwardContext = get_forward_context()
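# The MTP drafter uses its own isolated graph params so that its captured
# kernel arguments never overwrite the main model's.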
if forward_context.is_mtp_model:
graph_params = get_mtp_graph_params()
else:
graph_params = get_graph_params()
if forward_context.capturing:
stream = torch_npu.npu.current_stream()