[Feature] Support to use fullgraph with eagle (#5118)

### What this PR does / why we need it? We support to use full graph with eagle. Change list: 1. Distinguish between processing graph_params and draft_graph_params in attention_v1. 2. Adapt the full-graph mode in eagle_proposer, include: 1). If use full graph, make Fullgraph Wrapper when load model. 2). Build a new meatadata, set running mode in FULL and mark attention update in dummy_run when in Fullgraph mode. 3). Fixed and fill any attn_metadata, such as attn_metadata.slot_mapping. 4). Add a descriptor. 5). Set running mode and triggered update metadata. 3. Trans is_mtp_model to is_draft_model, and add the update of workspace. NOTE: When set async_scheduling=True, the draft model will enforce execution in eager mode. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: anon189Ty <Stari_Falcon@outlook.com> Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com> Co-authored-by: Yizhou <136800916+yiz-liu@users.noreply.github.com>
2025-12-29 09:54:51 +08:00
parent f81cf694b2
commit 3e67e8276c
11 changed files with 348 additions and 103 deletions
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
@@ -206,6 +206,51 @@ def test_eagle_correctness(
    del llm


+@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
+def test_eaqgle_fullgraph_correctness(
+    test_prompts: list[list[dict[str, Any]]],
+    sampling_config: SamplingParams,
+    model_name: str,
+    use_eagle3: bool,
+):
+    '''
+    Compare the outputs of a original LLM and a speculative LLM
+    should be the same when using eagle3 speculative decoding
+    in full-graph mode.
+    '''
+    spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
+    with VllmRunner(model_name, max_model_len=1024) as ref_llm:
+        ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
+
+    with VllmRunner(model_name,
+                    speculative_config={
+                        "method": "eagle3" if use_eagle3 else "eagle",
+                        "model": spec_model_name,
+                        "num_speculative_tokens": 4,
+                    },
+                    compilation_config={
+                        "level": 3,
+                        "cudagraph_mode": "FULL_DECODE_ONLY",
+                        "cudagraph_num_of_warmups": 1,
+                        "cudagraph_capture_sizes": [5, 10, 15, 20],
+                    },
+                    max_model_len=1024) as runner:
+        spec_outputs = runner.model.chat(test_prompts, sampling_config)
+    matches = 0
+    misses = 0
+    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+        if ref_output.outputs[0].text == spec_output.outputs[0].text:
+            matches += 1
+        else:
+            misses += 1
+            print(f"ref_output: {ref_output.outputs[0].text}")
+            print(f"spec_output: {spec_output.outputs[0].text}")
+
+    # Heuristic: expect at least 70% of the prompts to match exactly
+    # Upon failure, inspect the outputs to check for inaccuracy.
+    assert matches > int(0.66 * len(ref_outputs))
+
+
 def test_suffix_correctness(
    test_prompts: list[list[dict[str, Any]]],
    sampling_config: SamplingParams,