[Feature] Support using full graph with eagle (#5118)

### What this PR does / why we need it? This PR adds support for using full graph with eagle. Change list: 1. Distinguish between processing graph_params and draft_graph_params in attention_v1. 2. Adapt eagle_proposer to full-graph mode, including: 1). If full graph is used, wrap the model in a Fullgraph Wrapper when loading it. 2). Build new metadata, set the running mode to FULL, and mark the attention update in dummy_run when in full-graph mode. 3). Fix and fill in any attn_metadata fields, such as attn_metadata.slot_mapping. 4). Add a descriptor. 5). Set the running mode and trigger the metadata update. 3. Rename is_mtp_model to is_draft_model, and add the workspace update. NOTE: When async_scheduling=True is set, the draft model is forced to execute in eager mode. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: anon189Ty <Stari_Falcon@outlook.com> Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com> Co-authored-by: Yizhou <136800916+yiz-liu@users.noreply.github.com>
2025-12-29 09:54:51 +08:00
parent f81cf694b2
commit 3e67e8276c
11 changed files with 348 additions and 103 deletions
--- a/vllm_ascend/compilation/acl_graph.py
+++ b/vllm_ascend/compilation/acl_graph.py
@@ -254,7 +254,10 @@ def _update_attn_pa_params(update_stream, forward_context, runtime_shape):


 def _update_attn_fia_params(update_stream, forward_context, runtime_shape):
-    graph_params = get_graph_params()
+    if forward_context.is_draft_model:
+        graph_params = get_draft_graph_params()
+    else:
+        graph_params = get_graph_params()
    # For Qwen3-next, since the kv_cache_config has already categorized
    # linear_attn and self_attn, the attn_metadata is first arranged with
    # self_attn followed by linear_attn. Therefore, using zip directly
@@ -306,8 +309,8 @@ def update_attn_params(update_stream, forward_context, runtime_shape,

 def update_mla_attn_params(update_stream, forward_context, runtime_shape,
                           speculative_config):
-    if forward_context.is_mtp_model:
-        graph_params = get_mtp_graph_params()
+    if forward_context.is_draft_model:
+        graph_params = get_draft_graph_params()
    else:
        graph_params = get_graph_params()
    # FIXME: Behold! We are using a temporary hack here to update the args
@@ -326,7 +329,7 @@ def update_mla_attn_params(update_stream, forward_context, runtime_shape,
            seq_lens_list = forward_context.attn_metadata[
                key].decode.seq_lens_list
            if speculative_config and speculative_config.method == "mtp" \
-                    and not forward_context.is_mtp_model:
+                    and not forward_context.is_draft_model:
                actual_seq_lengths = forward_context.attn_metadata[
                    key].decode.actual_seq_lengths_q
                spec_multiple = speculative_config.num_speculative_tokens + 1
@@ -336,7 +339,7 @@ def update_mla_attn_params(update_stream, forward_context, runtime_shape,
                    spec_multiple * (i + 1)
                    for i in range(runtime_shape // spec_multiple)
                ]
-            elif forward_context.is_mtp_model:
+            elif forward_context.is_draft_model:
                actual_seq_lengths = forward_context.attn_metadata[
                    key].decode.actual_seq_lengths_q
                block_table = forward_context.attn_metadata[
@@ -440,8 +443,8 @@ def update_attn_dcp_pcp_params(update_stream, forward_context, runtime_shape):

 def update_mla_attn_dcp_pcp_params(update_stream, forward_context,
                                   runtime_shape):
-    if forward_context.is_mtp_model:
-        graph_params = get_mtp_graph_params()
+    if forward_context.is_draft_model:
+        graph_params = get_draft_graph_params()
    else:
        graph_params = get_graph_params()
    # FIXME: Behold! We are using a temporary hack here to update the args
@@ -527,14 +530,14 @@ def get_graph_params():
    return _graph_params


-_mtp_graph_params: Optional[GraphParams] = None
+_draft_graph_params: Optional[GraphParams] = None


-def set_mtp_graph_params(aclgraph_capture_sizes: list[int]):
-    global _mtp_graph_params
-    if _mtp_graph_params is not None:
-        raise ValueError("MTPGraph parameters have already been set!")
-    _mtp_graph_params = GraphParams(
+def set_draft_graph_params(aclgraph_capture_sizes: list[int]):
+    global _draft_graph_params
+    if _draft_graph_params is not None:
+        raise ValueError("DraftGraph parameters have already been set!")
+    _draft_graph_params = GraphParams(
        {size: []
         for size in aclgraph_capture_sizes},
        {size: None
@@ -546,11 +549,11 @@ def set_mtp_graph_params(aclgraph_capture_sizes: list[int]):
    )


-def update_mtp_graph_params_workspaces(num_tokens: int, workspace: Any):
-    global _mtp_graph_params
-    if _mtp_graph_params is not None:
-        _mtp_graph_params.workspaces[num_tokens] = workspace
+def update_draft_graph_params_workspaces(num_tokens: int, workspace: Any):
+    global _draft_graph_params
+    if _draft_graph_params is not None:
+        _draft_graph_params.workspaces[num_tokens] = workspace


-def get_mtp_graph_params():
-    return _mtp_graph_params
+def get_draft_graph_params():
+    return _draft_graph_params