[Bugfix] Fix padding logic in eagle proposer for kimi25 (#7348)

### What this PR does / why we need it?
This PR fixes the padding logic in the eagle proposer for kimi25. The main
changes are:
1. modify how the draft model's attention builder and backend are obtained
2. add block-table padding and the related tensor slicing in the common
metadata when `draft_step > 1`, fixing the FIA verification error
3. replace the block table in `update_graph_params`, fixing the FIA
verification error

- vLLM version: v0.17.0
- vLLM main:
4034c3d32e

Signed-off-by: Zetong Li <slippersss@126.com>
This commit is contained in:
Zetong Li
2026-03-21 16:57:22 +08:00
committed by GitHub
parent f482c314cf
commit 84a74f0cb1
4 changed files with 51 additions and 29 deletions

View File

@@ -215,6 +215,7 @@ class ExecuteModelState(NamedTuple):
positions: torch.Tensor
ec_connector_output: "ECConnectorOutput | None"
cudagraph_stats: CUDAGraphStat | None
batch_desc: BatchDescriptor
class NPUModelRunner(GPUModelRunner):
@@ -995,6 +996,7 @@ class NPUModelRunner(GPUModelRunner):
hidden_states: torch.Tensor,
aux_hidden_states: torch.Tensor = None,
sample_hidden_states: torch.Tensor = None,
target_model_batch_desc: BatchDescriptor = None,
) -> list[list[int]] | None:
if not self.drafter:
# Speculative decoding is not enabled.
@@ -1115,6 +1117,7 @@ class NPUModelRunner(GPUModelRunner):
next_token_ids=next_token_ids,
token_indices_to_sample=token_indices_to_sample,
common_attn_metadata=common_attn_metadata,
target_model_batch_desc=target_model_batch_desc,
sampling_metadata=sampling_metadata,
req_scheduled_tokens=req_scheduled_tokens,
long_seq_metadata=long_seq_metadata,
@@ -1455,6 +1458,7 @@ class NPUModelRunner(GPUModelRunner):
positions,
ec_connector_output,
cudagraph_stats,
batch_desc,
)
self.kv_connector_output = kv_connector_output
return None
@@ -1497,6 +1501,7 @@ class NPUModelRunner(GPUModelRunner):
positions,
ec_connector_output,
cudagraph_stats,
batch_desc,
) = self.execute_model_state
# Clear ephemeral state.
self.execute_model_state = None
@@ -1533,6 +1538,7 @@ class NPUModelRunner(GPUModelRunner):
hidden_states,
aux_hidden_states,
sample_hidden_states,
batch_desc,
)
self._copy_draft_token_ids_to_cpu(scheduler_output)