Revert "[BugFix] Support setting tp=1 for the Eagle draft model to take effect (#5519)"(#5902)

This reverts commit d886b81971, as it breaks the prefill/decode (PD) disaggregation feature.

- vLLM version: v0.13.0
- vLLM main: bde38c11df

Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
Author: zhaomingyu13
Date: 2026-01-14 20:55:10 +08:00
Committed by: GitHub
Parent: 2a6d95c389
Commit: 01805fbd7d
6 changed files with 11 additions and 61 deletions
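For context, the reverted patch let `draft_tensor_parallel_size` pin the Eagle draft model to TP=1 while the target model kept a larger TP size. Below is a minimal sketch of the configuration it enabled; the model names are illustrative placeholders, not taken from this commit, and after the revert vllm-ascend no longer honors a separate draft TP size:

from vllm import LLM

# Hypothetical usage: target model at TP=4, draft model pinned to TP=1.
# "draft_tensor_parallel_size" is the setting this revert disables on
# vllm-ascend; both model names below are placeholders.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    tensor_parallel_size=4,
    speculative_config={
        "method": "eagle",
        "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
        "num_speculative_tokens": 2,
        "draft_tensor_parallel_size": 1,
    },
)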

View File

@@ -23,7 +23,6 @@
 from __future__ import annotations

 import os
-from typing import Union

 import pytest
 from vllm import SamplingParams
@@ -124,11 +123,11 @@ def test_deepseek_mtp_correctness(model_name: str, num_speculative_tokens: int,
@pytest.mark.parametrize("method", ["eagle", "eagle3"]) @pytest.mark.parametrize("method", ["eagle", "eagle3"])
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False]) @pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
@pytest.mark.parametrize("async_scheduling", [True, False]) @pytest.mark.parametrize("async_scheduling", [True, False])
@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1]) def test_llama_qwen3_eagle_correctness(model_name: str, model_name_main: str,
def test_llama_qwen3_eagle_correctness( num_speculative_tokens: int,
model_name: str, model_name_main: str, num_speculative_tokens: int, method: str,
method: str, disable_padded_drafter_batch: bool, disable_padded_drafter_batch: bool,
async_scheduling: bool, draft_tensor_parallel_size: Union[None, int]): async_scheduling: bool):
example_prompts = [ example_prompts = [
"Hello, my name is", "Hello, my name is",
@@ -163,8 +162,6 @@ def test_llama_qwen3_eagle_correctness(
"method": method, "method": method,
"model": model_name, "model": model_name,
"num_speculative_tokens": num_speculative_tokens, "num_speculative_tokens": num_speculative_tokens,
"draft_tensor_parallel_size":
draft_tensor_parallel_size,
"max_model_len": 128, "max_model_len": 128,
"draft_vocab_size": 128256, "draft_vocab_size": 128256,
}, },

View File

@@ -4,7 +4,7 @@ from __future__ import annotations
 import math
 import os
 import random
-from typing import Any, Union
+from typing import Any

 import pytest
 from transformers import AutoTokenizer
@@ -217,11 +217,9 @@ def test_suffix_acceptance(
@pytest.mark.parametrize("use_eagle3", [True], ids=["eagle3"]) @pytest.mark.parametrize("use_eagle3", [True], ids=["eagle3"])
@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
def test_eagle_logprobs( def test_eagle_logprobs(
model_name: str, model_name: str,
use_eagle3: bool, use_eagle3: bool,
draft_tensor_parallel_size: Union[None, int],
): ):
prompt = {"role": "user", "content": "Hello world " * 10} prompt = {"role": "user", "content": "Hello world " * 10}
sampling_params = SamplingParams(temperature=0, sampling_params = SamplingParams(temperature=0,
@@ -248,7 +246,6 @@ def test_eagle_logprobs(
"method": "eagle3" if use_eagle3 else "eagle", "method": "eagle3" if use_eagle3 else "eagle",
"model": spec_model_name, "model": spec_model_name,
"num_speculative_tokens": 2, "num_speculative_tokens": 2,
"draft_tensor_parallel_size": draft_tensor_parallel_size,
"max_model_len": 128, "max_model_len": 128,
}, },
max_model_len=128, max_model_len=128,
@@ -274,13 +271,11 @@ def test_eagle_logprobs(
@pytest.mark.parametrize("method", MODELS.keys()) @pytest.mark.parametrize("method", MODELS.keys())
@pytest.mark.parametrize("num_speculative_tokens", [3]) @pytest.mark.parametrize("num_speculative_tokens", [3])
@pytest.mark.parametrize("draft_tensor_parallel_size", [None, 1])
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False]) @pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
@pytest.mark.parametrize("async_scheduling", [True, False]) @pytest.mark.parametrize("async_scheduling", [True, False])
def test_llama_qwen_eagle_acceptance( def test_llama_qwen_eagle_acceptance(
method: str, method: str,
num_speculative_tokens: int, num_speculative_tokens: int,
draft_tensor_parallel_size: Union[None, int],
disable_padded_drafter_batch: bool, disable_padded_drafter_batch: bool,
async_scheduling: bool, async_scheduling: bool,
): ):
@@ -331,7 +326,6 @@ def test_llama_qwen_eagle_acceptance(
     speculative_config = {
         "method": method,
         "num_speculative_tokens": num_speculative_tokens,
-        "draft_tensor_parallel_size": draft_tensor_parallel_size,
         "disable_padded_drafter_batch": disable_padded_drafter_batch,
         "model": spec_model_name,
     }

View File

@@ -27,8 +27,6 @@ class TestEagleProposerInitialization(TestBase):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
         self.vllm_config.model_config.uses_mrope = False
-        self.vllm_config.parallel_config.tensor_parallel_size = 1
-        self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
         self.vllm_config.speculative_config.num_speculative_tokens = 2
         self.vllm_config.speculative_config.speculative_token_tree = str([
             (i + 1) * (0, ) for i in range(2)
@@ -116,8 +114,6 @@ class TestEagleProposerLoadModel(TestBase):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
         self.vllm_config.model_config.uses_mrope = False
-        self.vllm_config.parallel_config.tensor_parallel_size = 1
-        self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
         self.vllm_config.speculative_config.num_speculative_tokens = 2
         self.vllm_config.speculative_config.speculative_token_tree = str([
             (i + 1) * (0, ) for i in range(2)
@@ -250,8 +246,6 @@ class TestEagleProposerDummyRun(TestBase):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
         self.vllm_config.model_config.uses_mrope = False
-        self.vllm_config.parallel_config.tensor_parallel_size = 1
-        self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
         self.vllm_config.speculative_config.speculative_token_tree = str([
             (i + 1) * (0, ) for i in range(4)
         ])
@@ -366,8 +360,6 @@ class TestEagleProposerHelperMethods(TestBase):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
         self.vllm_config.model_config.uses_mrope = False
-        self.vllm_config.parallel_config.tensor_parallel_size = 1
-        self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
         self.vllm_config.speculative_config.num_speculative_tokens = 2
         self.vllm_config.speculative_config.speculative_token_tree = str([
             (i + 1) * (0, ) for i in range(2)

View File

@@ -42,9 +42,6 @@ class TestMtpProposer:
         config.model_config.max_model_len = 2048
         config.model_config.uses_mrope = False
         config.model_config.hf_text_config = None
-        config.model_config.hf_config = None
-        config.parallel_config.tensor_parallel_size = 1
-        config.speculative_config.draft_tensor_parallel_size = 1
         config.load_config = None

View File

@@ -130,27 +130,6 @@ class EagleProposer(VllmEagleProposer):
         self.use_sparse = hasattr(vllm_config.model_config.hf_text_config,
                                   "index_topk")
-        # NOTE:
-        # `draft_tensor_parallel_size` does not take effect for Eagle: in
-        # practice the draft model uses the same TP size as the target model,
-        # so this patch sets tp=1 for the draft model separately.
-        # Because of the `_verify_and_get_draft_tp` check in vLLM, the value
-        # of `draft_tensor_parallel_size` here is either 1 or the same as the
-        # target model's TP size.
-        # TODO(zhaomingyu13): If we need a draft model TP that is neither 1
-        # nor equal to the target model's, this part should be rewritten.
-        if (vllm_config.parallel_config.tensor_parallel_size
-                != self.speculative_config.draft_tensor_parallel_size):
-            tp_group = init_model_parallel_group(
-                [[get_world_group().rank]],
-                get_world_group().rank,
-                torch.distributed.get_backend(get_world_group().device_group),
-                use_message_queue_broadcaster=True,
-                group_name="tp",
-            )
-            self.tp_group_context = patch_tensor_parallel_group(tp_group)
-        else:
-            self.tp_group_context = nullcontext()
         # TODO: Remove it when the bug of fx-graph is solved
         self.maybe_eager_context: ContextManager[Any] = nullcontext()
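For reference, the block deleted above implemented vLLM's TP-group patching pattern: build a group coordinator containing only the current rank and temporarily install it as the tensor-parallel group, so the drafter sees TP=1. A condensed sketch of that pattern, assuming vLLM's distributed helpers keep the signatures used in the removed code:

import torch
from vllm.distributed.parallel_state import (get_world_group,
                                             init_model_parallel_group,
                                             patch_tensor_parallel_group)

def tp1_context():
    # Hypothetical helper mirroring the removed code: a TP group holding
    # only this rank, so the draft model runs with tensor parallel size 1.
    world = get_world_group()
    tp_group = init_model_parallel_group(
        [[world.rank]],  # a single group containing just this rank
        world.rank,
        torch.distributed.get_backend(world.device_group),
        use_message_queue_broadcaster=True,
        group_name="tp",
    )
    # patch_tensor_parallel_group returns a context manager that swaps in
    # the given TP group for the duration of the with-block.
    return patch_tensor_parallel_group(tp_group)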

View File

@@ -165,10 +165,6 @@ def graph_capture(device: torch.device):
     yield graph_capture_context


-def get_tp_context(drafter):
-    return getattr(drafter, "tp_group_context", nullcontext())
-
-
 class ExecuteModelState(NamedTuple):
     """Ephemeral cached state transferred between execute_model() and
     sample_tokens(), after execute_model() returns None."""
@@ -2326,8 +2322,7 @@ class NPUModelRunner(GPUModelRunner):
         model_register(self.model, self.model_config)
         if self.drafter:
             logger.info("Loading drafter model...")
-            with get_tp_context(self.drafter):
-                self.drafter.load_model(self.model)
+            self.drafter.load_model(self.model)
             if self.use_aux_hidden_state_outputs:
                 self.model.set_aux_hidden_state_layers(
                     self.model.get_eagle3_aux_hidden_state_layers())
@@ -2703,15 +2698,11 @@ class NPUModelRunner(GPUModelRunner):
         kernel_block_sizes = []
         for kv_cache_group_id, kv_cache_group in enumerate(
                 kv_cache_config.kv_cache_groups):
-            kv_cache_spec = kv_cache_group.kv_cache_spec
-            if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
-                # All layers in a UniformTypeKVCacheSpecs have the same type;
-                # pick an arbitrary one to dispatch on.
-                kv_cache_spec = next(
-                    iter(kv_cache_spec.kv_cache_specs.values()))
-            if isinstance(kv_cache_spec, EncoderOnlyAttentionSpec):
+            if isinstance(kv_cache_group.kv_cache_spec,
+                          EncoderOnlyAttentionSpec):
                 continue
-            elif isinstance(kv_cache_spec, AttentionSpec):
+            elif isinstance(kv_cache_group.kv_cache_spec, AttentionSpec):
                 # This is an attention backend that supports virtual
                 # block splitting. Get the supported block sizes from
                 # the backend.
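The last hunk also reverts the UniformTypeKVCacheSpecs handling: before the revert, a uniform-type group's spec was unwrapped to one of its per-layer specs before dispatching on spec type. A minimal sketch of that unwrap-then-dispatch idiom, using only the names visible in the hunk (the import path is an assumption about vLLM v1's layout):

from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs

def resolve_kv_cache_spec(kv_cache_spec):
    # Pre-revert behavior (removed above): UniformTypeKVCacheSpecs wraps
    # per-layer specs that all share one type, so any single entry can
    # stand in for the whole group when checking the spec type.
    if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
        kv_cache_spec = next(iter(kv_cache_spec.kv_cache_specs.values()))
    return kv_cache_spec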