diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
index 01e56371..5d3f829d 100644
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
@@ -9,11 +9,31 @@ from typing import Any
 
 import pytest
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
+from vllm.config import CompilationConfig
+from vllm.v1.metrics.reader import Counter, Vector
 
 from tests.e2e.conftest import VllmRunner, cleanup_dist_env_and_memory
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
+MODELS = {
+    "eagle": {
+        "main": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
+        "spec": "vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B",
+    },
+    "eagle3": {
+        "main": "Qwen/Qwen3-8B",
+        "spec": "RedHatAI/Qwen3-8B-speculator.eagle3",
+    },
+}
+
+# NOTE: These golden values may change (the eagle proposer currently runs
+# only in eager mode); update them if CI fails but acceptance has improved.
+BASELINES = {
+    "eagle": [0.74, 0.44, 0.29],
+    "eagle3": [0.68, 0.40, 0.18],
+}
+
 @pytest.fixture
 def test_prompts():
@@ -324,3 +344,106 @@ def test_eagle_logprobs(
                         abs_tol=1e-1)
             assert ref_logprob.rank == spec_logprob.rank
             assert ref_logprob.decoded_token == spec_logprob.decoded_token
+
+
+@pytest.mark.parametrize("method", MODELS.keys())
+@pytest.mark.parametrize("num_speculative_tokens", [3])
+@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
+@pytest.mark.parametrize("async_scheduling", [True, False])
+def test_llama_qwen_eagle_acceptance(
+    method: str,
+    num_speculative_tokens: int,
+    disable_padded_drafter_batch: bool,
+    async_scheduling: bool,
+):
+    if disable_padded_drafter_batch and async_scheduling:
+        pytest.skip(
+            "skipped: disable_padded_drafter_batch=True with async_scheduling=True",
+        )
+
+    main_model_name = MODELS[method]["main"]
+    spec_model_name = MODELS[method]["spec"]
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        main_model_name,
+        trust_remote_code=True,
+    )
+    sampling_params = SamplingParams(
+        temperature=0,
+        ignore_eos=False,
+        max_tokens=256,
+    )
+
+    prompts = [
+        {
+            "role": "user",
+            "content": "Hello, my name is",
+        },
+        {
+            "role": "user",
+            "content": "The president of the United States is",
+        },
+        {
+            "role": "user",
+            "content": "The capital of France is",
+        },
+        {
+            "role": "user",
+            "content": "The future of AI is",
+        },
+    ]
+    prompts = [
+        tokenizer.apply_chat_template(
+            [prompt],
+            tokenize=False,
+            add_generation_prompt=True,
+        ) for prompt in prompts
+    ]
+
+    speculative_config = {
+        "method": method,
+        "num_speculative_tokens": num_speculative_tokens,
+        "disable_padded_drafter_batch": disable_padded_drafter_batch,
+        "model": spec_model_name,
+    }
+
+    compilation_config = CompilationConfig(cudagraph_capture_sizes=[12])
+
+    with VllmRunner(
+            main_model_name,
+            max_model_len=2048,
+            disable_log_stats=False,
+            tensor_parallel_size=1,
+            max_num_seqs=256,
+            distributed_executor_backend="mp",
+            gpu_memory_utilization=0.7,
+            speculative_config=speculative_config,
+            compilation_config=compilation_config,
+            async_scheduling=async_scheduling,
+    ) as llm:
+        _ = llm.generate(prompts, sampling_params)
+        metrics = llm.model.get_metrics()
+
+    num_drafts = 0
+    num_accepted_tokens_per_pos = [0] * num_speculative_tokens
+    for metric in metrics:
+        if metric.name == "vllm:spec_decode_num_drafts":
+            assert isinstance(metric, Counter)
+            num_drafts += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
+            assert isinstance(metric, Vector)
+            for pos in range(len(metric.values)):
+                num_accepted_tokens_per_pos[pos] += metric.values[pos]
+
+    acceptance_per_pos = [
+        num_accepted_tokens / num_drafts
+        for num_accepted_tokens in num_accepted_tokens_per_pos
+    ]
+    golden = BASELINES[method]
+
+    match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
+    if not match:
+        print(f"acceptance_per_pos: {acceptance_per_pos}")
+        print(f"golden: {golden}")
+
+    assert match
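
For review convenience, the metric aggregation at the end of the new test can be read as a standalone helper. A minimal sketch, assuming `metrics` is the list returned by vLLM's `LLM.get_metrics()` when the engine runs with `disable_log_stats=False`; the helper name `acceptance_per_position` is illustrative, while the metric names and `Counter`/`Vector` types mirror the diff:

```python
from vllm.v1.metrics.reader import Counter, Vector


def acceptance_per_position(metrics, num_speculative_tokens):
    """Aggregate spec-decode metrics into per-position acceptance rates.

    Position i's rate is the fraction of drafts whose i-th proposed
    token was accepted by the target model.
    """
    num_drafts = 0
    accepted = [0] * num_speculative_tokens
    for metric in metrics:
        if metric.name == "vllm:spec_decode_num_drafts":
            assert isinstance(metric, Counter)
            num_drafts += metric.value
        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
            assert isinstance(metric, Vector)
            for pos, value in enumerate(metric.values):
                accepted[pos] += value
    # Guard against an empty run before dividing.
    assert num_drafts > 0, "no spec-decode drafts were recorded"
    return [count / num_drafts for count in accepted]
```

The `+=` aggregation mirrors the test itself, which tolerates the same metric name appearing more than once in the returned list.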