[Test] Add acceptance test for eagle/eagle3 (#5366)
### What this PR does / why we need it?
This PR adds an acceptance test for eagle/eagle3 speculative decoding, exercised via Llama/Qwen models. The golden baselines were obtained by running the test several times against a healthy main branch, which makes them a feasible and convincing reference.
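For reference, this is roughly how such a baseline can be derived: run the acceptance measurement several times and average the per-position acceptance rates. A minimal sketch, assuming a hypothetical `measure_acceptance_per_pos()` helper that performs one run of the test added below and returns its `acceptance_per_pos` list:

```python
from statistics import mean
from typing import Callable


def golden_baseline(measure_acceptance_per_pos: Callable[[], list[float]],
                    num_runs: int = 5) -> list[float]:
    # Hypothetical sketch: measure_acceptance_per_pos() stands in for one run
    # of the acceptance test below, e.g. returning [0.75, 0.43, 0.30].
    runs = [measure_acceptance_per_pos() for _ in range(num_runs)]
    # Average each draft position across the runs.
    return [mean(pos_rates) for pos_rates in zip(*runs)]
```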
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
By CI.
- vLLM version: release/v0.13.0
- vLLM main: bc0a5a0c08
---------
Signed-off-by: Zetong Li <slippersss@126.com>
@@ -9,11 +9,31 @@ from typing import Any
 import pytest
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
+from vllm.config import CompilationConfig
+from vllm.v1.metrics.reader import Counter, Vector
 
 from tests.e2e.conftest import VllmRunner, cleanup_dist_env_and_memory
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
+MODELS = {
+    "eagle": {
+        "main": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
+        "spec": "vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B",
+    },
+    "eagle3": {
+        "main": "Qwen/Qwen3-8B",
+        "spec": "RedHatAI/Qwen3-8B-speculator.eagle3",
+    },
+}
+
+# NOTE: the golden baselines may change (eagle_proposer currently runs in eager mode only),
+# so please update them if CI fails but you observe better acceptance.
+BASELINES = {
+    "eagle": [0.74, 0.44, 0.29],
+    "eagle3": [0.68, 0.40, 0.18],
+}
 
 
 @pytest.fixture
 def test_prompts():
@@ -324,3 +344,106 @@ def test_eagle_logprobs(
                                 abs_tol=1e-1)
             assert ref_logprob.rank == spec_logprob.rank
             assert ref_logprob.decoded_token == spec_logprob.decoded_token
+
+
+@pytest.mark.parametrize("method", MODELS.keys())
+@pytest.mark.parametrize("num_speculative_tokens", [3])
+@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
+@pytest.mark.parametrize("async_scheduling", [True, False])
+def test_llama_qwen_eagle_acceptance(
+    method: str,
+    num_speculative_tokens: int,
+    disable_padded_drafter_batch: bool,
+    async_scheduling: bool,
+):
+    if disable_padded_drafter_batch and async_scheduling:
+        pytest.skip(
+            "skip disable_padded_drafter_batch=True and async_scheduling=True",
+        )
+
+    main_model_name = MODELS[method]["main"]
+    spec_model_name = MODELS[method]["spec"]
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        main_model_name,
+        trust_remote_code=True,
+    )
+    sampling_params = SamplingParams(
+        temperature=0,
+        ignore_eos=False,
+        max_tokens=256,
+    )
+
+    prompts = [
+        {
+            "role": "user",
+            "content": "Hello, my name is",
+        },
+        {
+            "role": "user",
+            "content": "The president of the United States is",
+        },
+        {
+            "role": "user",
+            "content": "The capital of France is",
+        },
+        {
+            "role": "user",
+            "content": "The future of AI is",
+        },
+    ]
+    prompts = [
+        tokenizer.apply_chat_template(
+            [prompt],
+            tokenize=False,
+            add_generation_prompt=True,
+        ) for prompt in prompts
+    ]
+
+    speculative_config = {
+        "method": method,
+        "num_speculative_tokens": num_speculative_tokens,
+        "disable_padded_drafter_batch": disable_padded_drafter_batch,
+        "model": spec_model_name,
+    }
+
+    compilation_config = CompilationConfig(cudagraph_capture_sizes=[12])
+
+    with VllmRunner(
+            main_model_name,
+            max_model_len=2048,
+            disable_log_stats=False,
+            tensor_parallel_size=1,
+            max_num_seqs=256,
+            distributed_executor_backend="mp",
+            gpu_memory_utilization=0.7,
+            speculative_config=speculative_config,
+            compilation_config=compilation_config,
+            async_scheduling=async_scheduling,
+    ) as llm:
+        _ = llm.generate(prompts, sampling_params)
+        metrics = llm.model.get_metrics()
+
+    num_drafts = 0
+    num_accepted_tokens_per_pos = [0] * num_speculative_tokens
+    for metric in metrics:
+        if metric.name == "vllm:spec_decode_num_drafts":
+            assert isinstance(metric, Counter)
+            num_drafts += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
+            assert isinstance(metric, Vector)
+            for pos in range(len(metric.values)):
+                num_accepted_tokens_per_pos[pos] += metric.values[pos]
+
+    acceptance_per_pos = [
+        num_accepted_tokens / num_drafts
+        for num_accepted_tokens in num_accepted_tokens_per_pos
+    ]
+    golden = BASELINES[method]
+
+    match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
+    if not match:
+        print(f"acceptance_per_pos: {acceptance_per_pos}")
+        print(f"golden: {golden}")
+
+    assert match
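As a sanity check on the baselines: since `acceptance_per_pos[k]` is the fraction of drafts whose k-th speculative token is accepted, the expected number of accepted draft tokens per drafting step is the sum over positions (linearity of expectation). A minimal sketch using the baseline values from this test:

```python
# Expected accepted draft tokens per drafting step = sum of the per-position
# acceptance rates (linearity of expectation over the per-position indicators).
BASELINES = {
    "eagle": [0.74, 0.44, 0.29],
    "eagle3": [0.68, 0.40, 0.18],
}

for method, acceptance_per_pos in BASELINES.items():
    print(f"{method}: ~{sum(acceptance_per_pos):.2f} accepted tokens per draft")
    # eagle: ~1.47, eagle3: ~1.26
```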