[BugFix] Support setting tp=1 for the Eagle draft model to take effect (#6097)

### What this PR does / why we need it?

According to the official documentation, setting `"draft_tensor_parallel_size": 1` in `speculative_config` should apply to the Eagle3 draft model. In practice, however, debugging showed that the draft model's tensor parallel size (tp) always matched the target model's, so setting tp for the draft model did not take effect. This PR makes the setting take effect as documented.
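
For illustration, the intended fallback behavior can be summarized by the following minimal sketch (hypothetical helper name, not the actual vLLM code): the draft model should inherit the target model's tp only when `draft_tensor_parallel_size` is unset.

```python
from typing import Optional


def resolve_draft_tp(target_tp: int, draft_tp: Optional[int]) -> int:
    """Hypothetical sketch of the intended resolution logic.

    Before this fix, the draft model effectively always ran with the
    target model's tp; with the fix, an explicitly configured
    draft_tensor_parallel_size takes precedence.
    """
    if draft_tp is None:
        return target_tp  # unset: inherit the target model's tp
    return draft_tp


assert resolve_draft_tp(4, None) == 4  # default: follow the target model
assert resolve_draft_tp(4, 1) == 1     # the case this PR makes effective
```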

**Note:** This feature has not yet been tested in combination with `sp` and `dp`; those combinations will be adapted later.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

The existing Eagle correctness, logprobs, and acceptance tests are parametrized with `draft_tensor_parallel_size` over `[None, 1]` (see the diff below). The fix can also be verified manually with the script below, which runs the target model with tp=4 and the Eagle3 draft model with tp=1:
```python
from vllm import LLM, SamplingParams

def main():
    prompts = [
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",
        tensor_parallel_size=4,
        gpu_memory_utilization=0.9,
        enforce_eager=True,
        speculative_config={
            "method": "eagle3",
            "model": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
            # With this fix, the draft model runs with tp=1 while the
            # target model keeps tp=4.
            "draft_tensor_parallel_size": 1,
            "num_speculative_tokens": 3,
        },
    )
    outputs = llm.generate(prompts, sampling_params)
    print(f"Outputs: {outputs}")
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

if __name__ == "__main__":
    main()
```
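
On a machine with at least four GPUs, this script should now launch the Eagle3 draft model with tp=1 while the target model uses tp=4; before this fix, the draft model silently ran with the target's tp=4.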

Fixes vllm-project/vllm#31345


- vLLM version: v0.13.0
- vLLM main: d68209402d

Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
Co-authored-by: drslark <slarksblood@qq.com>

6 changed files with 61 additions and 11 deletions; the two test files are shown below.

```diff
@@ -23,6 +23,7 @@
 from __future__ import annotations
 import os
+from typing import Union
 import pytest
 from vllm import SamplingParams
@@ -124,11 +125,11 @@ def test_deepseek_mtp_correctness(model_name: str, num_speculative_tokens: int,
 @pytest.mark.parametrize("method", ["eagle", "eagle3"])
 @pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
 @pytest.mark.parametrize("async_scheduling", [True, False])
-def test_llama_qwen3_eagle_correctness(model_name: str, model_name_main: str,
-                                       num_speculative_tokens: int,
-                                       method: str,
-                                       disable_padded_drafter_batch: bool,
-                                       async_scheduling: bool):
+@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
+def test_llama_qwen3_eagle_correctness(
+        model_name: str, model_name_main: str, num_speculative_tokens: int,
+        method: str, disable_padded_drafter_batch: bool,
+        async_scheduling: bool, draft_tensor_parallel_size: Union[None, int]):
     example_prompts = [
         "Hello, my name is",
@@ -163,6 +164,8 @@ def test_llama_qwen3_eagle_correctness(model_name: str, model_name_main: str,
             "method": method,
             "model": model_name,
             "num_speculative_tokens": num_speculative_tokens,
+            "draft_tensor_parallel_size":
+            draft_tensor_parallel_size,
             "max_model_len": 128,
             "draft_vocab_size": 128256,
         },
```

```diff
@@ -4,7 +4,7 @@ from __future__ import annotations
 import math
 import os
 import random
-from typing import Any
+from typing import Any, Union

 import pytest
 from transformers import AutoTokenizer
@@ -267,9 +267,11 @@ def test_suffix_acceptance(
 @pytest.mark.parametrize("use_eagle3", [True], ids=["eagle3"])
+@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
 def test_eagle_logprobs(
     model_name: str,
     use_eagle3: bool,
+    draft_tensor_parallel_size: Union[None, int],
 ):
     prompt = {"role": "user", "content": "Hello world " * 10}
     sampling_params = SamplingParams(temperature=0,
@@ -296,6 +298,7 @@ def test_eagle_logprobs(
             "method": "eagle3" if use_eagle3 else "eagle",
             "model": spec_model_name,
             "num_speculative_tokens": 2,
+            "draft_tensor_parallel_size": draft_tensor_parallel_size,
             "max_model_len": 128,
         },
         max_model_len=128,
@@ -321,11 +324,13 @@ def test_eagle_logprobs(
 @pytest.mark.parametrize("method", MODELS.keys())
 @pytest.mark.parametrize("num_speculative_tokens", [3])
+@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
 @pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
 @pytest.mark.parametrize("async_scheduling", [True, False])
 def test_llama_qwen_eagle_acceptance(
     method: str,
     num_speculative_tokens: int,
+    draft_tensor_parallel_size: Union[None, int],
     disable_padded_drafter_batch: bool,
     async_scheduling: bool,
 ):
@@ -376,6 +381,7 @@ def test_llama_qwen_eagle_acceptance(
     speculative_config = {
         "method": method,
         "num_speculative_tokens": num_speculative_tokens,
+        "draft_tensor_parallel_size": draft_tensor_parallel_size,
         "disable_padded_drafter_batch": disable_padded_drafter_batch,
         "model": spec_model_name,
     }
```
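
With `draft_tensor_parallel_size` parametrized over `[None, 1]`, each of these tests now covers both the default behavior (the draft model inherits the target model's tp) and the tp=1 draft path that this PR fixes.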