Eagle3 mm support, enablement on qwen3vl (#4848)

### What this PR does / why we need it? follow pr [https://github.com/vllm-project/vllm/pull/20788](https://github.com/vllm-project/vllm/pull/20788) , Eagle3 mm support, enablement on qwen3vl target model [Qwen/Qwen3-VL-8B-Instruct]([https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct]) eagle3 [MNN/Qwen3-VL-8B-Instruct-Eagle3](https://www.modelscope.cn/models/MNN/Qwen3-VL-8B-Instruct-Eagle3) ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? pytest ./tests/e2e/singlecard/test_completion_with_prompt_embeds.py -vv vLLM with eagle3 : ```bash vllm serve /model/Qwen3-VL-8B-Instruct --enforce-eager --port 9100 --max-model-len 32768 --max-num-seqs 32 --tensor-parallel-size 2 --allowed-local-media-path /model/gx/images --speculative-config '{ "method": "eagle3", "model": "/model/hf/Qwen3-VL-8B-Instruct-Eagle3", "num_speculative_tokens": 3 }' ``` vLLM without eagle3 : ```bash vllm serve /model/Qwen3-VL-8B-Instruct --enforce-eager --port 9100 --max-model-len 32768 --max-num-seqs 32 --tensor-parallel-size 2 --allowed-local-media-path /model/gx/images ``` bench: ``` vllm bench serve --backend openai-chat --base-url http://127.0.0.1:9100 --tokenizer /model/Qwen3-VL-8B-Instruct --endpoint /v1/chat/completions --model /model/Qwen3-VL-8B-Instruct --dataset-name random --num-prompts 50 --max-concurrency 5 --temperature 0 --top-p 1.0 --seed 123 ``` - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: jesse <szxfml@gmail.com>
2026-01-19 08:58:07 +08:00
parent 05e69b99e5
commit 2b6dc100b5
5 changed files with 145 additions and 29 deletions
--- a/tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
+++ b/tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
@@ -85,6 +85,14 @@ def eagle3_model_name():
    return "vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B"


+@pytest.fixture
+def vl_model_name():
+    return "Qwen/Qwen3-VL-8B-Instruct"
+
+def vl_eagle3_model_name():
+    return "MNN/Qwen3-VL-8B-Instruct-Eagle3"
+
+
 def test_ngram_correctness(
    test_prompts: list[list[dict[str, Any]]],
    sampling_config: SamplingParams,
@@ -129,6 +137,48 @@ def test_ngram_correctness(
    assert matches > int(0.66 * len(ref_outputs))


+def test_qwen3_vl_eagle_correctness(
+    test_prompts: list[list[dict[str, Any]]],
+    sampling_config: SamplingParams,
+    vl_model_name: str,
+):
+    '''
+    Compare the outputs of a original LLM and a speculative LLM
+    should be the same when using eagle speculative decoding.
+    '''
+    with VllmRunner(
+            vl_model_name,
+            max_model_len=1024,
+            cudagraph_capture_sizes=[1, 2, 4, 8],
+    ) as ref_llm:
+        ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
+
+    spec_model_name = vl_eagle3_model_name()
+    with VllmRunner(
+            vl_model_name,
+            speculative_config={
+                "method": "eagle3",
+                "model": spec_model_name,
+                "num_speculative_tokens": 2,
+            },
+            max_model_len=1024,
+            cudagraph_capture_sizes=[1, 2, 4, 8],
+    ) as runner:
+        spec_outputs = runner.model.chat(test_prompts, sampling_config)
+    matches = 0
+    misses = 0
+    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+        if ref_output.outputs[0].text == spec_output.outputs[0].text:
+            matches += 1
+        else:
+            misses += 1
+            print(f"ref_output: {ref_output.outputs[0].text}")
+            print(f"spec_output: {spec_output.outputs[0].text}")
+
+    # Heuristic: expect at least 70% of the prompts to match exactly
+    # Upon failure, inspect the outputs to check for inaccuracy.
+    assert matches > int(0.66 * len(ref_outputs))
+
 def test_suffix_correctness(
    test_prompts: list[list[dict[str, Any]]],
    sampling_config: SamplingParams,