[Test] Add acceptance test for eagle/eagle3 (#5366)

### What this PR does / why we need it?
This PR adds an acceptance test for eagle/eagle3 speculative decoding on Llama/Qwen models. The golden baselines were obtained by running the test several times against a healthy main branch, which makes them reasonably stable and convincing. See the sketch below for how the acceptance rates are computed.
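
For reference, this is a minimal sketch of the per-position acceptance computation the new test performs (the helper name `acceptance_per_position` is illustrative; the metric names are the actual vLLM spec-decode metrics the test reads):

```python
# Minimal sketch of the acceptance computation in the new test.
# `metrics` is the list returned by llm.model.get_metrics().
def acceptance_per_position(metrics, num_speculative_tokens):
    num_drafts = 0
    accepted = [0] * num_speculative_tokens
    for metric in metrics:
        if metric.name == "vllm:spec_decode_num_drafts":
            num_drafts += metric.value  # total number of draft proposals
        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
            for pos, count in enumerate(metric.values):
                accepted[pos] += count  # accepted tokens per draft position
    # Fraction of drafts whose token at each position was accepted;
    # the test compares these against golden baselines with an
    # absolute tolerance of 0.06.
    return [count / num_drafts for count in accepted]
```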

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
By CI.

- vLLM version: release/v0.13.0
- vLLM main: bc0a5a0c08

---------

Signed-off-by: Zetong Li <slippersss@126.com>
Commit: 16ef2474bf (parent 8ed6f98a5a)
Author: Zetong Li
Date: 2025-12-27 08:50:01 +08:00
Committed by: GitHub


@@ -9,11 +9,31 @@ from typing import Any
import pytest
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig
from vllm.v1.metrics.reader import Counter, Vector

from tests.e2e.conftest import VllmRunner, cleanup_dist_env_and_memory

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

MODELS = {
    "eagle": {
        "main": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
        "spec": "vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B",
    },
    "eagle3": {
        "main": "Qwen/Qwen3-8B",
        "spec": "RedHatAI/Qwen3-8B-speculator.eagle3",
    },
}

# NOTE: the golden baselines may change (eagle_proposer currently runs only
# in eager mode); please update them if CI fails but you observe better
# acceptance rates.
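# Each list holds the per-position acceptance rate for speculative
# positions 0..num_speculative_tokens-1 under greedy decoding.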
BASELINES = {
    "eagle": [0.74, 0.44, 0.29],
    "eagle3": [0.68, 0.40, 0.18],
}

@pytest.fixture
def test_prompts():
@@ -324,3 +344,106 @@ def test_eagle_logprobs(
                                abs_tol=1e-1)
            assert ref_logprob.rank == spec_logprob.rank
            assert ref_logprob.decoded_token == spec_logprob.decoded_token

@pytest.mark.parametrize("method", MODELS.keys())
@pytest.mark.parametrize("num_speculative_tokens", [3])
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
@pytest.mark.parametrize("async_scheduling", [True, False])
def test_llama_qwen_eagle_acceptance(
    method: str,
    num_speculative_tokens: int,
    disable_padded_drafter_batch: bool,
    async_scheduling: bool,
):
    if disable_padded_drafter_batch and async_scheduling:
        pytest.skip(
            "skip disable_padded_drafter_batch=True and async_scheduling=True",
        )
    main_model_name = MODELS[method]["main"]
    spec_model_name = MODELS[method]["spec"]
    tokenizer = AutoTokenizer.from_pretrained(
        main_model_name,
        trust_remote_code=True,
    )
    sampling_params = SamplingParams(
        temperature=0,
        ignore_eos=False,
        max_tokens=256,
    )
    prompts = [
        {"role": "user", "content": "Hello, my name is"},
        {"role": "user", "content": "The president of the United States is"},
        {"role": "user", "content": "The capital of France is"},
        {"role": "user", "content": "The future of AI is"},
    ]
    prompts = [
        tokenizer.apply_chat_template(
            [prompt],
            tokenize=False,
            add_generation_prompt=True,
        ) for prompt in prompts
    ]
    speculative_config = {
        "method": method,
        "num_speculative_tokens": num_speculative_tokens,
        "disable_padded_drafter_batch": disable_padded_drafter_batch,
        "model": spec_model_name,
    }
    compilation_config = CompilationConfig(cudagraph_capture_sizes=[12])
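    # The eagle proposer currently runs only in eager mode (see NOTE above),
    # so graph capture here applies to the target model.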
    with VllmRunner(
            main_model_name,
            max_model_len=2048,
            disable_log_stats=False,
            tensor_parallel_size=1,
            max_num_seqs=256,
            distributed_executor_backend="mp",
            gpu_memory_utilization=0.7,
            speculative_config=speculative_config,
            compilation_config=compilation_config,
            async_scheduling=async_scheduling,
    ) as llm:
        _ = llm.generate(prompts, sampling_params)
        metrics = llm.model.get_metrics()
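        # Aggregate spec-decode stats across the returned metrics:
        # "vllm:spec_decode_num_drafts" counts draft proposals, while
        # "vllm:spec_decode_num_accepted_tokens_per_pos" counts accepted
        # tokens at each speculative position.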
        num_drafts = 0
        num_accepted_tokens_per_pos = [0] * num_speculative_tokens
        for metric in metrics:
            if metric.name == "vllm:spec_decode_num_drafts":
                assert isinstance(metric, Counter)
                num_drafts += metric.value
            elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
                assert isinstance(metric, Vector)
                for pos in range(len(metric.values)):
                    num_accepted_tokens_per_pos[pos] += metric.values[pos]
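        # Per-position acceptance rate: the fraction of drafts whose token
        # at that position was accepted by the target model.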
        acceptance_per_pos = [
            num_accepted_tokens / num_drafts
            for num_accepted_tokens in num_accepted_tokens_per_pos
        ]
        golden = BASELINES[method]
        match = all(
            abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
        if not match:
            print(f"acceptance_per_pos: {acceptance_per_pos}")
            print(f"golden: {golden}")
        assert match