### What this PR does / why we need it?

Add speculative decoding (spec decode) support for the V1 Engine.

- Ascend does not currently support Triton kernels, so the Triton kernel in `rejection_sampler.py` is rewritten in PyTorch (a sketch of the sampling rule it implements follows below). Because the PyTorch version does not perform as well as Triton, the kernel will be reimplemented in Ascend C in the future.
- Spec decode currently supports only the ngram algorithm; the eagle algorithm still needs to be adapted.

### Does this PR introduce _any_ user-facing change?

No user-facing change.

### How was this patch tested?

Tested with `tests/singlecard/spec_decode/e2e/test_v1_spec_decode.py` and `tests/sample/test_rejection_sampler.py`, which cover the base functionality of the rejection sampler and the end-to-end behavior of spec decode.

Signed-off-by: ponix-j <657511300@qq.com>
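For context, the rejection sampler referenced above follows the standard speculative-decoding acceptance rule: a draft token x is accepted with probability min(1, p(x)/q(x)), where p is the target-model distribution and q the draft distribution; on rejection, a replacement token is sampled from the normalized residual max(p - q, 0). The snippet below is only a minimal pure-PyTorch sketch of that rule for a single sequence, not this PR's actual implementation; the function name and tensor shapes are illustrative assumptions.

```python
import torch


def rejection_sample(draft_token_ids: torch.Tensor,
                     draft_probs: torch.Tensor,
                     target_probs: torch.Tensor) -> torch.Tensor:
    """Sketch only. draft_probs/target_probs: [num_draft_tokens, vocab_size]."""
    output_ids = []
    for i, token_id in enumerate(draft_token_ids.tolist()):
        p = target_probs[i, token_id]
        q = draft_probs[i, token_id]
        # Accept the draft token with probability min(1, p / q).
        if torch.rand(()) <= torch.clamp(p / q, max=1.0):
            output_ids.append(token_id)
            continue
        # On rejection, resample from the normalized residual distribution
        # max(p - q, 0) and stop processing the remaining draft tokens.
        residual = torch.clamp(target_probs[i] - draft_probs[i], min=0.0)
        output_ids.append(int(torch.multinomial(residual / residual.sum(), 1)))
        break
    return torch.tensor(output_ids, dtype=torch.long)
```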
`tests/singlecard/spec_decode/e2e/test_v1_spec_decode.py` (157 lines, 5.0 KiB):
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

import os
import random
from typing import Any

import pytest
from vllm import LLM, SamplingParams

os.environ["VLLM_USE_MODELSCOPE"] = "True"

@pytest.fixture
def test_prompts():
    prompt_types = ["repeat", "sentence"]
    num_prompts = 100
    prompts = []

    random.seed(0)
    random_prompt_type_choices = random.choices(prompt_types, k=num_prompts)

    # Generate a mixed batch of prompts, some of which can be easily
    # predicted by n-gram matching and some which likely cannot.
    for kind in random_prompt_type_choices:
        word_choices = ["test", "temp", "hello", "where"]
        word = random.choice(word_choices)
        if kind == "repeat":
            prompt = f"""
            please repeat the word '{word}' 10 times.
            give no other output than the word at least ten times in a row,
            in lowercase with spaces between each word and without quotes.
            """
        elif kind == "sentence":
            prompt = f"""
            please give a ten-word sentence that
            uses the word {word} at least once.
            give no other output than that simple sentence without quotes.
            """
        else:
            raise ValueError(f"Unknown prompt type: {kind}")
        prompts.append([{"role": "user", "content": prompt}])

    return prompts

@pytest.fixture
def sampling_config():
    return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)


@pytest.fixture
def model_name():
    return "LLM-Research/Meta-Llama-3.1-8B-Instruct"


def eagle_model_name():
    return "vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B"


def eagle3_model_name():
    return "vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B"

def test_ngram_correctness(
        monkeypatch: pytest.MonkeyPatch,
        test_prompts: list[list[dict[str, Any]]],
        sampling_config: SamplingParams,
        model_name: str,
):
    '''
    Check that the outputs of the original LLM and the speculative LLM
    are the same when using ngram speculative decoding.
    '''
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        ref_llm = LLM(model=model_name, max_model_len=1024)
        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
        del ref_llm

        spec_llm = LLM(
            model=model_name,
            speculative_config={
                "method": "ngram",
                "prompt_lookup_max": 5,
                "prompt_lookup_min": 3,
                "num_speculative_tokens": 3,
            },
            max_model_len=1024,
        )
        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
        matches = 0
        misses = 0
        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
            if ref_output.outputs[0].text == spec_output.outputs[0].text:
                matches += 1
            else:
                misses += 1
                print(f"ref_output: {ref_output.outputs[0].text}")
                print(f"spec_output: {spec_output.outputs[0].text}")

        # Heuristic: expect at least 70% of the prompts to match exactly.
        # Upon failure, inspect the outputs to check for inaccuracy.
        assert matches > int(0.7 * len(ref_outputs))
        del spec_llm

@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
def test_eagle_correctness(
        monkeypatch: pytest.MonkeyPatch,
        test_prompts: list[list[dict[str, Any]]],
        sampling_config: SamplingParams,
        model_name: str,
        use_eagle3: bool,
):
    '''
    Check that the outputs of the original LLM and the speculative LLM
    are the same when using eagle speculative decoding.
    '''
    pytest.skip("Eagle speculative decoding is not supported yet.")
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        ref_llm = LLM(model=model_name, max_model_len=2048)
        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
        del ref_llm

        spec_model_name = (eagle3_model_name()
                           if use_eagle3 else eagle_model_name())
        spec_llm = LLM(
            model=model_name,
            trust_remote_code=True,
            speculative_config={
                "method": "eagle3" if use_eagle3 else "eagle",
                "model": spec_model_name,
                "num_speculative_tokens": 3,
                "max_model_len": 2048,
            },
            max_model_len=2048,
        )
        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
        matches = 0
        misses = 0
        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
            if ref_output.outputs[0].text == spec_output.outputs[0].text:
                matches += 1
            else:
                misses += 1
                print(f"ref_output: {ref_output.outputs[0].text}")
                print(f"spec_output: {spec_output.outputs[0].text}")

        # Heuristic: expect at least 66% of the prompts to match exactly.
        # Upon failure, inspect the outputs to check for inaccuracy.
        assert matches > int(0.66 * len(ref_outputs))
        del spec_llm