[Feature] implement eagle spec decoding for model runner v2 (#5840)

### What this PR does / why we need it? This PR implements EAGLE speculative decoding for model runner v2; please see the RFC: https://github.com/vllm-project/vllm-ascend/issues/5208 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? vLLM version: v0.13.0 --------- Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
2026-01-14 09:18:05 +08:00
parent 0415e694cd
commit e20813f441
9 changed files with 468 additions and 82 deletions
--- a/tests/e2e/singlecard/model_runner_v2/test_basic.py
+++ b/tests/e2e/singlecard/model_runner_v2/test_basic.py
@@ -25,6 +25,9 @@ from tests.e2e.conftest import VllmRunner

 MODELS = ["Qwen/Qwen3-0.6B"]

+MAIN_MODELS = ["LLM-Research/Meta-Llama-3.1-8B-Instruct"]  # main (non-draft) model
+EGALE_MODELS = ["vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B"]  # used as speculative_config["model"]
+

@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@@ -49,3 +52,36 @@ def test_qwen3_dense_eager_mode(
            enforce_eager=enforce_eager,
    ) as runner:
        runner.model.generate(prompts, sampling_params)
+
+
+@pytest.mark.parametrize("model", MAIN_MODELS)
+@pytest.mark.parametrize("eagle_model", EGALE_MODELS)
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("enforce_eager", [True])
+@patch.dict(os.environ, {"VLLM_USE_V2_MODEL_RUNNER": "1"})  # env var is patched only while this test runs
+def test_egale_spec_decoding(  # NOTE(review): "egale" looks like a typo for "eagle" (also in EGALE_MODELS)
+    model: str,
+    eagle_model: str,
+    max_tokens: int,
+    enforce_eager: bool,
+) -> None:
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
+    with VllmRunner(
+            model,
+            max_model_len=1024,
+            enforce_eager=enforce_eager,
+            async_scheduling=True,
+            speculative_config={
+                "model": eagle_model,
+                "method": "eagle",
+                "num_speculative_tokens": 3,
+            },
+    ) as runner:
+        runner.model.generate(prompts, sampling_params)  # smoke test: the generated output is not asserted on