[Bugfix] fix eagle proposer (#4971)

### What this PR does / why we need it?
After https://github.com/vllm-project/vllm-ascend/pull/4764, the tensors created by
`make_buffer` are accessed through renamed attributes, e.g. `input_ids` ->
`input_ids.gpu`. This PR updates the eagle proposer to use the new names.
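
The rename follows from how `make_buffer` packages these tensors after #4764: instead of separate `input_ids` / `input_ids_cpu` / `positions_np` attributes on the runner, each buffer bundles a device tensor, a host tensor, and a NumPy view, addressed as `.gpu`, `.cpu`, and `.np`. A minimal sketch of that assumed layout (the class name and the `copy_to_gpu` helper below are illustrative, not the actual vLLM implementation):

```python
import torch


class PairedBuffer:
    """Illustrative stand-in for the buffer returned by make_buffer()."""

    def __init__(self, size: int, dtype: torch.dtype, device: str) -> None:
        # Host-side staging tensor plus a NumPy view sharing its storage.
        self.cpu = torch.zeros(size, dtype=dtype, device="cpu")
        self.np = self.cpu.numpy()
        # Device-side tensor that the model actually consumes.
        self.gpu = torch.zeros(size, dtype=dtype, device=device)

    def copy_to_gpu(self, n: int) -> torch.Tensor:
        # Stage the first n host elements onto the device asynchronously.
        self.gpu[:n].copy_(self.cpu[:n], non_blocking=True)
        return self.gpu[:n]


# Old runner attributes (pre-#4764)  ->  accessors used by this fix
#   runner.input_ids[:n]            ->  runner.input_ids.gpu[:n]
#   runner.input_ids_cpu[:n]        ->  runner.input_ids.cpu[:n]
#   runner.positions_np[:n]         ->  runner.positions.np[:n]
```

With this layout, the proposer fills `buf.np` / `buf.cpu` on the host and copies into `buf.gpu` with `non_blocking=True`, mirroring the `.copy_` calls in the diff below.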

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
Author: realliujiaxu
Date: 2025-12-12 22:39:49 +08:00
Committed by: GitHub
Parent: 45889a6185
Commit: 3581946256


```diff
@@ -169,7 +169,7 @@ class EagleProposer(Proposer):
         eagle_attn_metadata = attn_metadata[self.attn_layer_name]
         if spec_decode_metadata is None:
             # input_ids can be None for multimodal models.
-            target_token_ids = self.runner.input_ids[:num_scheduled_tokens]
+            target_token_ids = self.runner.input_ids.gpu[:num_scheduled_tokens]
             target_positions = positions[:num_scheduled_tokens]
             if self.name == SpecDcodeType.EAGLE3:
                 target_hidden_states = torch.cat(
@@ -192,7 +192,7 @@ class EagleProposer(Proposer):
             )
             cu_num_tokens, token_indices =\
                 self._prepare_inputs(eagle_attn_metadata, num_rejected_tokens)
-            target_token_ids = self.runner.input_ids[token_indices]
+            target_token_ids = self.runner.input_ids.gpu[token_indices]
             target_positions = positions[token_indices]
             if self.name == SpecDcodeType.EAGLE3:
                 target_hidden_states = torch.cat(
@@ -245,7 +245,7 @@ class EagleProposer(Proposer):
                                          num_scheduled_tokens)
         # Get positions.
-        positions_np = self.runner.positions_np[:total_num_scheduled_tokens]
+        positions_np = self.runner.positions.np[:total_num_scheduled_tokens]
         np.add(self.runner.input_batch.num_computed_tokens_cpu[req_indices],
                arange,
                out=positions_np)
@@ -270,7 +270,7 @@ class EagleProposer(Proposer):
             self.runner.input_batch.token_ids_cpu_tensor.flatten(),
             0,
             torch.from_numpy(token_indices),
-            out=self.runner.input_ids_cpu[:total_num_scheduled_tokens])
+            out=self.runner.input_ids.cpu[:total_num_scheduled_tokens])
         # Prepare the attention metadata for each KV cache group and make layers
         # in the same group share the same metadata.
@@ -299,40 +299,41 @@ class EagleProposer(Proposer):
         np.add(
             block_numbers * block_size,
             block_offsets,
-            out=block_table.slot_mapping_np[:total_num_scheduled_tokens])
+            out=block_table.slot_mapping.np[:total_num_scheduled_tokens])
         # Prepare the attention metadata.
-        self.runner.query_start_loc_np[0] = 0
-        self.runner.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
-        self.runner.seq_lens_np[:num_reqs] = (
+        self.runner.query_start_loc.np[0] = 0
+        self.runner.query_start_loc.np[1:num_reqs + 1] = cu_num_tokens
+        self.runner.seq_lens.np[:num_reqs] = (
             self.runner.input_batch.num_computed_tokens_cpu[:num_reqs] +
             num_scheduled_tokens)
         # Copy the tensors to the NPU.
-        self.runner.input_ids[:total_num_scheduled_tokens].copy_(
-            self.runner.input_ids_cpu[:total_num_scheduled_tokens],
+        self.runner.input_ids.gpu[:total_num_scheduled_tokens].copy_(
+            self.runner.input_ids.cpu[:total_num_scheduled_tokens],
             non_blocking=True)
         if self.runner.uses_mrope:
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-            self.runner.mrope_positions[:, :total_num_scheduled_tokens].copy_(
-                self.runner.
-                mrope_positions_cpu[:, :total_num_scheduled_tokens],
-                non_blocking=True)
+            self.runner.mrope_positions.gpu[:, :total_num_scheduled_tokens] \
+                .copy_(
+                    self.runner.
+                    mrope_positions.cpu[:, :total_num_scheduled_tokens],
+                    non_blocking=True)
         else:
             # Common case (1D positions)
-            self.runner.positions[:total_num_scheduled_tokens].copy_(
-                self.runner.positions_cpu[:total_num_scheduled_tokens],
+            self.runner.positions.gpu[:total_num_scheduled_tokens].copy_(
+                self.runner.positions.cpu[:total_num_scheduled_tokens],
                 non_blocking=True)
-        self.runner.query_start_loc[:num_reqs + 1].copy_(
-            self.runner.query_start_loc_cpu[:num_reqs + 1], non_blocking=True)
-        self.runner.seq_lens[:num_reqs].copy_(
-            self.runner.seq_lens_cpu[:num_reqs], non_blocking=True)
+        self.runner.query_start_loc.gpu[:num_reqs + 1].copy_(
+            self.runner.query_start_loc.cpu[:num_reqs + 1], non_blocking=True)
+        self.runner.seq_lens.gpu[:num_reqs].copy_(
+            self.runner.seq_lens.cpu[:num_reqs], non_blocking=True)
         # Fill unused with -1. Needed for reshape_and_cache
-        self.runner.seq_lens[num_reqs:].fill_(0)
-        self.runner.query_start_loc[num_reqs + 1:].fill_(-1)
+        self.runner.seq_lens.gpu[num_reqs:].fill_(0)
+        self.runner.query_start_loc.gpu[num_reqs + 1:].fill_(-1)
         attn_metadata = {}
         # Prepare the attention metadata for each KV cache group and make layers
@@ -340,10 +341,10 @@ class EagleProposer(Proposer):
         for kv_cache_group_id, kv_cache_group_spec in enumerate(
                 self.runner.kv_cache_config.kv_cache_groups):
             common_attn_metadata = AscendCommonAttentionMetadata(
-                query_start_loc=self.runner.query_start_loc[:num_reqs + 1],
-                query_start_loc_cpu=self.runner.query_start_loc_cpu[:num_reqs +
+                query_start_loc=self.runner.query_start_loc.gpu[:num_reqs + 1],
+                query_start_loc_cpu=self.runner.query_start_loc.cpu[:num_reqs +
                                                                     1],
-                seq_lens_cpu=self.runner.seq_lens_cpu,
+                seq_lens_cpu=self.runner.seq_lens.cpu,
                 num_reqs=num_reqs,
                 max_query_len=max_num_scheduled_tokens,
                 num_actual_tokens=total_num_scheduled_tokens,
@@ -351,8 +352,8 @@ class EagleProposer(Proposer):
                 block_table_tensor=self.runner.input_batch.block_table[0].
                 get_device_tensor(),
                 slot_mapping=self.runner.input_batch.block_table[0].
-                slot_mapping,
-                positions=self.runner.positions,
+                slot_mapping.gpu,
+                positions=self.runner.positions.gpu,
                 attn_mask=self.runner.attn_mask,
                 spec_attn_mask=self.runner.spec_attn_mask,
                 attn_state=self.runner.attn_state,
```