Revert "[BugFix] Support setting tp=1 for the Eagle draft model to take effect (#5519)" (#5902)

This reverts commit d886b81971. It breaks the PD (prefill/decode disaggregation) function.

- vLLM version: v0.13.0
- vLLM main:
bde38c11df

Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
This commit is contained in:
zhaomingyu13
2026-01-14 20:55:10 +08:00
committed by GitHub
parent 2a6d95c389
commit 01805fbd7d
6 changed files with 11 additions and 61 deletions

View File

@@ -165,10 +165,6 @@ def graph_capture(device: torch.device):
yield graph_capture_context
def get_tp_context(drafter):
return getattr(drafter, "tp_group_context", nullcontext())
class ExecuteModelState(NamedTuple):
"""Ephemeral cached state transferred between execute_model() and
sample_tokens(), after execute_model() returns None."""
@@ -2326,8 +2322,7 @@ class NPUModelRunner(GPUModelRunner):
model_register(self.model, self.model_config)
if self.drafter:
logger.info("Loading drafter model...")
with get_tp_context(self.drafter):
self.drafter.load_model(self.model)
self.drafter.load_model(self.model)
if self.use_aux_hidden_state_outputs:
self.model.set_aux_hidden_state_layers(
self.model.get_eagle3_aux_hidden_state_layers())
@@ -2703,15 +2698,11 @@ class NPUModelRunner(GPUModelRunner):
kernel_block_sizes = []
for kv_cache_group_id, kv_cache_group in enumerate(
kv_cache_config.kv_cache_groups):
kv_cache_spec = kv_cache_group.kv_cache_spec
if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
# All layers in the UniformTypeKVCacheSpecs have the same type,
# Pick an arbitrary one to dispatch.
kv_cache_spec = next(
iter(kv_cache_spec.kv_cache_specs.values()))
if isinstance(kv_cache_spec, EncoderOnlyAttentionSpec):
if isinstance(kv_cache_group.kv_cache_spec,
EncoderOnlyAttentionSpec):
continue
elif isinstance(kv_cache_spec, AttentionSpec):
elif isinstance(kv_cache_group.kv_cache_spec, AttentionSpec):
# This is an attention backend that supports virtual
# block splitting. Get the supported block sizes from
# the backend.