[Feat] Support async_scheduler and disable_padded_drafter_batch in eagle (#4893)
### What this PR does / why we need it?
We refactored the eagle_proposer.py to adapt the framework of eagle.py
in vllm-v0.12.0, to support the logit of padded drafter batch and
async-scheduler.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
Co-authored-by: drslark <slarksblood@qq.com>
This commit is contained in:
@@ -801,7 +801,8 @@ class NPUModelRunner(GPUModelRunner):
|
||||
self.requests[r].num_tokens for r in self.input_batch.req_ids
|
||||
]
|
||||
num_tokens_np = np.array(num_tokens, dtype=np.int32)
|
||||
num_reqs = self.input_batch.num_reqs
|
||||
base_num_reqs = self.input_batch.num_reqs
|
||||
num_reqs = base_num_reqs
|
||||
if self.pcp_size > 1:
|
||||
# while pcp > 1, we need the original num_scheduled_tokens before split
|
||||
# to calculate discard_requests_mask
|
||||
@@ -1106,6 +1107,11 @@ class NPUModelRunner(GPUModelRunner):
|
||||
if self.speculative_config and \
|
||||
self.spec_decode_common_attn_metadata is None:
|
||||
self.spec_decode_common_attn_metadata = common_attn_metadata
|
||||
if self.speculative_config.method in ("eagle", "eagle3") and \
|
||||
self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs():
|
||||
self.spec_decode_common_attn_metadata = \
|
||||
self.spec_decode_common_attn_metadata.unpadded(
|
||||
total_num_scheduled_tokens, base_num_reqs)
|
||||
|
||||
for attn_group in self.attn_groups[kv_cache_group_id]:
|
||||
common_prefix_len = 0
|
||||
@@ -1591,7 +1597,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
with ProfileExecuteDuration().capture_async("Draft"):
|
||||
if self.speculative_config:
|
||||
use_padded_batch_for_eagle = self.speculative_config and \
|
||||
self.speculative_config.method == "mtp" and \
|
||||
self.speculative_config.use_eagle() and \
|
||||
not self.speculative_config.disable_padded_drafter_batch
|
||||
if use_padded_batch_for_eagle:
|
||||
# EAGLE speculative decoding can use the GPU sampled tokens
|
||||
|
||||
Reference in New Issue
Block a user