[Test] Add acceptance test for eagle/eagle3 (#5366)

### What this PR does / why we need it? This PR adds acceptance tests for eagle/eagle3 using Llama and Qwen models. The golden baselines were obtained by running the test several times on a healthy main branch, which is a practical and convincing way to establish them. ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? By CI - vLLM version: release/v0.13.0 - vLLM main: bc0a5a0c08 --------- Signed-off-by: Zetong Li <slippersss@126.com>
2025-12-27 08:50:01 +08:00
parent 8ed6f98a5a
commit 16ef2474bf
1 changed files with 123 additions and 0 deletions
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
@@ -9,11 +9,31 @@ from typing import Any
 import pytest
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
+from vllm.config import CompilationConfig
+from vllm.v1.metrics.reader import Counter, Vector

 from tests.e2e.conftest import VllmRunner, cleanup_dist_env_and_memory

 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

+MODELS = {  # method -> main model and the speculative ("spec") model for it
+    "eagle": {
+        "main": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
+        "spec": "vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B",
+    },
+    "eagle3": {
+        "main": "Qwen/Qwen3-8B",
+        "spec": "RedHatAI/Qwen3-8B-speculator.eagle3",
+    },
+}
+
+# NOTE: goldens may change (eagle_proposer currently runs only in eager mode);
+# please update them if CI fails because acceptance has improved.
+BASELINES = {  # method -> golden acceptance rate for each draft position
+    "eagle": [0.74, 0.44, 0.29],
+    "eagle3": [0.68, 0.40, 0.18],
+}
+

@pytest.fixture
 def test_prompts():
@@ -324,3 +344,106 @@ def test_eagle_logprobs(
                            abs_tol=1e-1)
        assert ref_logprob.rank == spec_logprob.rank
        assert ref_logprob.decoded_token == spec_logprob.decoded_token
+
+
+@pytest.mark.parametrize("method", MODELS.keys())
+@pytest.mark.parametrize("num_speculative_tokens", [3])
+@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
+@pytest.mark.parametrize("async_scheduling", [True, False])
+def test_llama_qwen_eagle_acceptance(
+    method: str,
+    num_speculative_tokens: int,
+    disable_padded_drafter_batch: bool,
+    async_scheduling: bool,
+):
+    """Check per-position draft acceptance rates against golden baselines.
+
+    Runs temperature-0 generation over four chat prompts with speculative
+    decoding enabled, sums the spec-decode counters from the engine metrics,
+    and requires every position's acceptance rate (accepted tokens / drafts)
+    to be within 0.06 of the matching entry in BASELINES[method].
+    """
+    if disable_padded_drafter_batch and async_scheduling:
+        pytest.skip(
+            "skip disable_padded_drafter_batch=True and async_scheduling=True",
+        )
+
+    main_model_name = MODELS[method]["main"]
+    spec_model_name = MODELS[method]["spec"]
+    golden = BASELINES[method]
+    # zip() in the final check would silently drop positions on a mismatch.
+    assert len(golden) == num_speculative_tokens, (
+        f"BASELINES[{method!r}] must have {num_speculative_tokens} entries")
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        main_model_name,
+        trust_remote_code=True,
+    )
+    sampling_params = SamplingParams(
+        temperature=0,
+        ignore_eos=False,
+        max_tokens=256,
+    )
+
+    questions = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    prompts = [
+        tokenizer.apply_chat_template(
+            [{"role": "user", "content": question}],
+            tokenize=False,
+            add_generation_prompt=True,
+        ) for question in questions
+    ]
+
+    speculative_config = {
+        "method": method,
+        "num_speculative_tokens": num_speculative_tokens,
+        "disable_padded_drafter_batch": disable_padded_drafter_batch,
+        "model": spec_model_name,
+    }
+
+    compilation_config = CompilationConfig(cudagraph_capture_sizes=[12])
+
+    with VllmRunner(
+            main_model_name,
+            max_model_len=2048,
+            disable_log_stats=False,
+            tensor_parallel_size=1,
+            max_num_seqs=256,
+            distributed_executor_backend="mp",
+            gpu_memory_utilization=0.7,
+            speculative_config=speculative_config,
+            compilation_config=compilation_config,
+            async_scheduling=async_scheduling,
+    ) as llm:
+        _ = llm.generate(prompts, sampling_params)
+        metrics = llm.model.get_metrics()
+
+    num_drafts = 0
+    num_accepted_tokens_per_pos = [0] * num_speculative_tokens
+    for metric in metrics:
+        if metric.name == "vllm:spec_decode_num_drafts":
+            assert isinstance(metric, Counter)
+            num_drafts += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
+            assert isinstance(metric, Vector)
+            for pos, accepted in enumerate(metric.values):
+                num_accepted_tokens_per_pos[pos] += accepted
+
+    # Without any drafts the division below would raise ZeroDivisionError;
+    # fail with an explicit message instead.
+    assert num_drafts > 0, "no speculative drafts were recorded in metrics"
+
+    acceptance_per_pos = [
+        num_accepted_tokens / num_drafts
+        for num_accepted_tokens in num_accepted_tokens_per_pos
+    ]
+
+    # Two-sided check: a much better rate also fails so goldens get updated.
+    match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
+    assert match, (f"acceptance_per_pos: {acceptance_per_pos}, "
+                   f"golden: {golden}")