[Bugfix] Fix padding logic in eagle proposer for kimi25 (#7348)
### What this PR does / why we need it?
This PR aims to fix padding logic in eagle proposer for kimi25. Main
changes involve:
1. Modify the way the draft model's attention builder and backend are obtained.
2. Add block-table padding and the related tensor slicing in the common
metadata when `draft_step > 1`, to fix the FIA verification error.
3. Replace the block table in `update_graph_params`, also to fix the FIA
verification error.
- vLLM version: v0.17.0
- vLLM main:
4034c3d32e
Signed-off-by: Zetong Li <slippersss@126.com>
This commit is contained in:
@@ -215,6 +215,7 @@ class ExecuteModelState(NamedTuple):
     positions: torch.Tensor
     ec_connector_output: "ECConnectorOutput | None"
     cudagraph_stats: CUDAGraphStat | None
     batch_desc: BatchDescriptor


 class NPUModelRunner(GPUModelRunner):
@@ -995,6 +996,7 @@ class NPUModelRunner(GPUModelRunner):
         hidden_states: torch.Tensor,
         aux_hidden_states: torch.Tensor = None,
         sample_hidden_states: torch.Tensor = None,
         target_model_batch_desc: BatchDescriptor = None,
     ) -> list[list[int]] | None:
         if not self.drafter:
             # Speculative decoding is not enabled.
@@ -1115,6 +1117,7 @@ class NPUModelRunner(GPUModelRunner):
             next_token_ids=next_token_ids,
             token_indices_to_sample=token_indices_to_sample,
             common_attn_metadata=common_attn_metadata,
             target_model_batch_desc=target_model_batch_desc,
             sampling_metadata=sampling_metadata,
             req_scheduled_tokens=req_scheduled_tokens,
             long_seq_metadata=long_seq_metadata,
@@ -1455,6 +1458,7 @@ class NPUModelRunner(GPUModelRunner):
             positions,
             ec_connector_output,
             cudagraph_stats,
             batch_desc,
         )
         self.kv_connector_output = kv_connector_output
         return None
@@ -1497,6 +1501,7 @@ class NPUModelRunner(GPUModelRunner):
             positions,
             ec_connector_output,
             cudagraph_stats,
             batch_desc,
         ) = self.execute_model_state
         # Clear ephemeral state.
         self.execute_model_state = None
@@ -1533,6 +1538,7 @@ class NPUModelRunner(GPUModelRunner):
             hidden_states,
             aux_hidden_states,
             sample_hidden_states,
             batch_desc,
         )
         self._copy_draft_token_ids_to_cpu(scheduler_output)
Reference in New Issue
Block a user