Revert "[BugFix] Support setting tp=1 for the Eagle draft model to take effect (#5519)" (#5902)

This reverts commit d886b81971. It breaks the prefill-decode (PD) disaggregation function.

- vLLM version: v0.13.0
- vLLM main:
bde38c11df

Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
This commit is contained in:
zhaomingyu13
2026-01-14 20:55:10 +08:00
committed by GitHub
parent 2a6d95c389
commit 01805fbd7d
6 changed files with 11 additions and 61 deletions

View File

@@ -23,7 +23,6 @@
from __future__ import annotations
import os
from typing import Union
import pytest
from vllm import SamplingParams
@@ -124,11 +123,11 @@ def test_deepseek_mtp_correctness(model_name: str, num_speculative_tokens: int,
@pytest.mark.parametrize("method", ["eagle", "eagle3"])
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
@pytest.mark.parametrize("async_scheduling", [True, False])
@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
def test_llama_qwen3_eagle_correctness(
model_name: str, model_name_main: str, num_speculative_tokens: int,
method: str, disable_padded_drafter_batch: bool,
async_scheduling: bool, draft_tensor_parallel_size: Union[None, int]):
def test_llama_qwen3_eagle_correctness(model_name: str, model_name_main: str,
num_speculative_tokens: int,
method: str,
disable_padded_drafter_batch: bool,
async_scheduling: bool):
example_prompts = [
"Hello, my name is",
@@ -163,8 +162,6 @@ def test_llama_qwen3_eagle_correctness(
"method": method,
"model": model_name,
"num_speculative_tokens": num_speculative_tokens,
"draft_tensor_parallel_size":
draft_tensor_parallel_size,
"max_model_len": 128,
"draft_vocab_size": 128256,
},

View File

@@ -4,7 +4,7 @@ from __future__ import annotations
import math
import os
import random
from typing import Any, Union
from typing import Any
import pytest
from transformers import AutoTokenizer
@@ -217,11 +217,9 @@ def test_suffix_acceptance(
@pytest.mark.parametrize("use_eagle3", [True], ids=["eagle3"])
@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
def test_eagle_logprobs(
model_name: str,
use_eagle3: bool,
draft_tensor_parallel_size: Union[None, int],
):
prompt = {"role": "user", "content": "Hello world " * 10}
sampling_params = SamplingParams(temperature=0,
@@ -248,7 +246,6 @@ def test_eagle_logprobs(
"method": "eagle3" if use_eagle3 else "eagle",
"model": spec_model_name,
"num_speculative_tokens": 2,
"draft_tensor_parallel_size": draft_tensor_parallel_size,
"max_model_len": 128,
},
max_model_len=128,
@@ -274,13 +271,11 @@ def test_eagle_logprobs(
@pytest.mark.parametrize("method", MODELS.keys())
@pytest.mark.parametrize("num_speculative_tokens", [3])
@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
@pytest.mark.parametrize("async_scheduling", [True, False])
def test_llama_qwen_eagle_acceptance(
method: str,
num_speculative_tokens: int,
draft_tensor_parallel_size: Union[None, int],
disable_padded_drafter_batch: bool,
async_scheduling: bool,
):
@@ -331,7 +326,6 @@ def test_llama_qwen_eagle_acceptance(
speculative_config = {
"method": method,
"num_speculative_tokens": num_speculative_tokens,
"draft_tensor_parallel_size": draft_tensor_parallel_size,
"disable_padded_drafter_batch": disable_padded_drafter_batch,
"model": spec_model_name,
}