[Bugfix] Fix padding logic in eagle proposer for kimi25 (#7348)
### What this PR does / why we need it?
This PR aims to fix padding logic in eagle proposer for kimi25. Main
changes involve:
1. Modify the way the draft model's attention builder and backend are obtained.
2. Add block-table padding and the related tensor slicing in the common
metadata when `draft_step > 1`, to fix the FIA verification error.
3. Replace the block table in `update_graph_params`, also to fix the FIA
verification error.
- vLLM version: v0.17.0
- vLLM main:
4034c3d32e
Signed-off-by: Zetong Li <slippersss@126.com>
This commit is contained in:
@@ -215,6 +215,7 @@ class ExecuteModelState(NamedTuple):
     positions: torch.Tensor
     ec_connector_output: "ECConnectorOutput | None"
     cudagraph_stats: CUDAGraphStat | None
     batch_desc: BatchDescriptor


 class NPUModelRunner(GPUModelRunner):
@@ -995,6 +996,7 @@ class NPUModelRunner(GPUModelRunner):
         hidden_states: torch.Tensor,
         aux_hidden_states: torch.Tensor = None,
         sample_hidden_states: torch.Tensor = None,
         target_model_batch_desc: BatchDescriptor = None,
     ) -> list[list[int]] | None:
         if not self.drafter:
             # Speculative decoding is not enabled.
@@ -1115,6 +1117,7 @@ class NPUModelRunner(GPUModelRunner):
             next_token_ids=next_token_ids,
             token_indices_to_sample=token_indices_to_sample,
             common_attn_metadata=common_attn_metadata,
             target_model_batch_desc=target_model_batch_desc,
             sampling_metadata=sampling_metadata,
             req_scheduled_tokens=req_scheduled_tokens,
             long_seq_metadata=long_seq_metadata,
@@ -1455,6 +1458,7 @@ class NPUModelRunner(GPUModelRunner):
             positions,
             ec_connector_output,
             cudagraph_stats,
             batch_desc,
         )
         self.kv_connector_output = kv_connector_output
         return None
@@ -1497,6 +1501,7 @@ class NPUModelRunner(GPUModelRunner):
             positions,
             ec_connector_output,
             cudagraph_stats,
             batch_desc,
         ) = self.execute_model_state
         # Clear ephemeral state.
         self.execute_model_state = None
@@ -1533,6 +1538,7 @@ class NPUModelRunner(GPUModelRunner):
             hidden_states,
             aux_hidden_states,
             sample_hidden_states,
             batch_desc,
         )
         self._copy_draft_token_ids_to_cpu(scheduler_output)
Reference in New Issue
Block a user