[Feature] implement eagle spec decoding for model runner v2 (#5840)
### What this PR does / why we need it? This PR implements EAGLE speculative decoding for model runner v2; please see the RFC at https://github.com/vllm-project/vllm-ascend/issues/5208 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? vLLM version: v0.13.0 --------- Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
This commit is contained in:
@@ -25,6 +25,9 @@ from tests.e2e.conftest import VllmRunner
|
||||
|
||||
MODELS = ["Qwen/Qwen3-0.6B"]
|
||||
|
||||
MAIN_MODELS = ["LLM-Research/Meta-Llama-3.1-8B-Instruct"]
|
||||
EGALE_MODELS = ["vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@@ -49,3 +52,36 @@ def test_qwen3_dense_eager_mode(
|
||||
enforce_eager=enforce_eager,
|
||||
) as runner:
|
||||
runner.model.generate(prompts, sampling_params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MAIN_MODELS)
@pytest.mark.parametrize("eagle_model", EGALE_MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("enforce_eager", [True])
@patch.dict(os.environ, {"VLLM_USE_V2_MODEL_RUNNER": "1"})
def test_egale_spec_decoding(
    model: str,
    eagle_model: str,
    max_tokens: int,
    enforce_eager: bool,
) -> None:
    """Smoke-test generation with an EAGLE draft model attached.

    ``VLLM_USE_V2_MODEL_RUNNER`` is patched to ``"1"`` in ``os.environ`` for
    the duration of the test. The test passes when ``generate`` returns
    without raising; it makes no assertion about the generated text.

    Args:
        model: Identifier of the main (target) model, from ``MAIN_MODELS``.
        eagle_model: Identifier of the draft model, from ``EGALE_MODELS``.
        max_tokens: Value forwarded to ``SamplingParams(max_tokens=...)``.
        enforce_eager: Value forwarded to ``VllmRunner(enforce_eager=...)``.
    """
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    decode_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)

    # Draft-model settings passed to the runner as ``speculative_config``.
    eagle_spec_config = {
        "model": eagle_model,
        "method": "eagle",
        "num_speculative_tokens": 3,
    }

    with VllmRunner(
        model,
        max_model_len=1024,
        enforce_eager=enforce_eager,
        async_scheduling=True,
        speculative_config=eagle_spec_config,
    ) as llm_runner:
        llm_runner.model.generate(example_prompts, decode_params)
|
||||
|
||||
Reference in New Issue
Block a user