[Feat] Support async_scheduler and disable_padded_drafter_batch in eagle (#4893)

### What this PR does / why we need it?
We refactored eagle_proposer.py to follow the framework of eagle.py
in vLLM v0.12.0, in order to support the logits of padded drafter batches and
the async scheduler.

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
Co-authored-by: drslark <slarksblood@qq.com>
This commit is contained in:
anon189Ty
2025-12-16 22:06:40 +08:00
committed by GitHub
parent cee521bad5
commit 5b1da4e914
6 changed files with 577 additions and 403 deletions

View File

@@ -801,7 +801,8 @@ class NPUModelRunner(GPUModelRunner):
self.requests[r].num_tokens for r in self.input_batch.req_ids
]
num_tokens_np = np.array(num_tokens, dtype=np.int32)
num_reqs = self.input_batch.num_reqs
base_num_reqs = self.input_batch.num_reqs
num_reqs = base_num_reqs
if self.pcp_size > 1:
# while pcp > 1, we need the original num_scheduled_tokens before split
# to calculate discard_requests_mask
@@ -1106,6 +1107,11 @@ class NPUModelRunner(GPUModelRunner):
if self.speculative_config and \
self.spec_decode_common_attn_metadata is None:
self.spec_decode_common_attn_metadata = common_attn_metadata
if self.speculative_config.method in ("eagle", "eagle3") and \
self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs():
self.spec_decode_common_attn_metadata = \
self.spec_decode_common_attn_metadata.unpadded(
total_num_scheduled_tokens, base_num_reqs)
for attn_group in self.attn_groups[kv_cache_group_id]:
common_prefix_len = 0
@@ -1591,7 +1597,7 @@ class NPUModelRunner(GPUModelRunner):
with ProfileExecuteDuration().capture_async("Draft"):
if self.speculative_config:
use_padded_batch_for_eagle = self.speculative_config and \
self.speculative_config.method == "mtp" and \
self.speculative_config.use_eagle() and \
not self.speculative_config.disable_padded_drafter_batch
if use_padded_batch_for_eagle:
# EAGLE speculative decoding can use the GPU sampled tokens