[Bugfix] fix eagle proposer (#4971)
### What this PR does / why we need it?
After https://github.com/vllm-project/vllm-ascend/pull/4764, many tensors
created by `make_buffer` must be accessed under their new names, e.g.
`input_ids` -> `input_ids.gpu`.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
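For context, the rename is mechanical: `make_buffer` now hands back a paired host/device buffer instead of a bare tensor, so the former `foo` / `foo_cpu` / `foo_np` accesses become `foo.gpu` / `foo.cpu` / `foo.np`. A minimal sketch of such a wrapper, for orientation only (the class name, signature, and pinning behavior here are assumptions, not the exact upstream implementation):

```python
import torch


class CpuGpuBuffer:
    """Paired host/device tensors exposed as .cpu / .np / .gpu views.

    Illustrative sketch only; the real buffer returned by make_buffer
    may differ (e.g. it likely pins host memory for async copies).
    """

    def __init__(self, *size: int, dtype: torch.dtype, device: str):
        # Host-side staging tensor.
        self.cpu = torch.zeros(size, dtype=dtype, device="cpu")
        # NumPy view aliasing self.cpu's storage, for cheap host-side fills.
        self.np = self.cpu.numpy()
        # Device-side tensor the model consumes; device="npu" assumes
        # torch_npu is installed on an Ascend box.
        self.gpu = torch.zeros(size, dtype=dtype, device=device)
```

Hence the recurring pattern in the hunks below: fill `buffer.np` on the host, then stage the device copy with `buffer.gpu[...].copy_(buffer.cpu[...], non_blocking=True)`.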
```diff
@@ -169,7 +169,7 @@ class EagleProposer(Proposer):
         eagle_attn_metadata = attn_metadata[self.attn_layer_name]
         if spec_decode_metadata is None:
             # input_ids can be None for multimodal models.
-            target_token_ids = self.runner.input_ids[:num_scheduled_tokens]
+            target_token_ids = self.runner.input_ids.gpu[:num_scheduled_tokens]
             target_positions = positions[:num_scheduled_tokens]
             if self.name == SpecDcodeType.EAGLE3:
                 target_hidden_states = torch.cat(
@@ -192,7 +192,7 @@ class EagleProposer(Proposer):
                 )
         cu_num_tokens, token_indices =\
             self._prepare_inputs(eagle_attn_metadata, num_rejected_tokens)
-        target_token_ids = self.runner.input_ids[token_indices]
+        target_token_ids = self.runner.input_ids.gpu[token_indices]
         target_positions = positions[token_indices]
         if self.name == SpecDcodeType.EAGLE3:
             target_hidden_states = torch.cat(
@@ -245,7 +245,7 @@ class EagleProposer(Proposer):
                 num_scheduled_tokens)

         # Get positions.
-        positions_np = self.runner.positions_np[:total_num_scheduled_tokens]
+        positions_np = self.runner.positions.np[:total_num_scheduled_tokens]
         np.add(self.runner.input_batch.num_computed_tokens_cpu[req_indices],
                arange,
                out=positions_np)
@@ -270,7 +270,7 @@ class EagleProposer(Proposer):
             self.runner.input_batch.token_ids_cpu_tensor.flatten(),
             0,
             torch.from_numpy(token_indices),
-            out=self.runner.input_ids_cpu[:total_num_scheduled_tokens])
+            out=self.runner.input_ids.cpu[:total_num_scheduled_tokens])

         # Prepare the attention metadata for each KV cache group and make layers
         # in the same group share the same metadata.
@@ -299,40 +299,41 @@ class EagleProposer(Proposer):
         np.add(
             block_numbers * block_size,
             block_offsets,
-            out=block_table.slot_mapping_np[:total_num_scheduled_tokens])
+            out=block_table.slot_mapping.np[:total_num_scheduled_tokens])

         # Prepare the attention metadata.
-        self.runner.query_start_loc_np[0] = 0
-        self.runner.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
+        self.runner.query_start_loc.np[0] = 0
+        self.runner.query_start_loc.np[1:num_reqs + 1] = cu_num_tokens

-        self.runner.seq_lens_np[:num_reqs] = (
+        self.runner.seq_lens.np[:num_reqs] = (
             self.runner.input_batch.num_computed_tokens_cpu[:num_reqs] +
             num_scheduled_tokens)

         # Copy the tensors to the NPU.
-        self.runner.input_ids[:total_num_scheduled_tokens].copy_(
-            self.runner.input_ids_cpu[:total_num_scheduled_tokens],
+        self.runner.input_ids.gpu[:total_num_scheduled_tokens].copy_(
+            self.runner.input_ids.cpu[:total_num_scheduled_tokens],
             non_blocking=True)
         if self.runner.uses_mrope:
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-            self.runner.mrope_positions[:, :total_num_scheduled_tokens].copy_(
-                self.runner.
-                mrope_positions_cpu[:, :total_num_scheduled_tokens],
-                non_blocking=True)
+            self.runner.mrope_positions.gpu[:, :total_num_scheduled_tokens] \
+                .copy_(
+                    self.runner.
+                    mrope_positions.cpu[:, :total_num_scheduled_tokens],
+                    non_blocking=True)
         else:
             # Common case (1D positions)
-            self.runner.positions[:total_num_scheduled_tokens].copy_(
-                self.runner.positions_cpu[:total_num_scheduled_tokens],
+            self.runner.positions.gpu[:total_num_scheduled_tokens].copy_(
+                self.runner.positions.cpu[:total_num_scheduled_tokens],
                 non_blocking=True)

-        self.runner.query_start_loc[:num_reqs + 1].copy_(
-            self.runner.query_start_loc_cpu[:num_reqs + 1], non_blocking=True)
-        self.runner.seq_lens[:num_reqs].copy_(
-            self.runner.seq_lens_cpu[:num_reqs], non_blocking=True)
+        self.runner.query_start_loc.gpu[:num_reqs + 1].copy_(
+            self.runner.query_start_loc.cpu[:num_reqs + 1], non_blocking=True)
+        self.runner.seq_lens.gpu[:num_reqs].copy_(
+            self.runner.seq_lens.cpu[:num_reqs], non_blocking=True)

         # Fill unused with -1. Needed for reshape_and_cache
-        self.runner.seq_lens[num_reqs:].fill_(0)
-        self.runner.query_start_loc[num_reqs + 1:].fill_(-1)
+        self.runner.seq_lens.gpu[num_reqs:].fill_(0)
+        self.runner.query_start_loc.gpu[num_reqs + 1:].fill_(-1)

         attn_metadata = {}
         # Prepare the attention metadata for each KV cache group and make layers
@@ -340,10 +341,10 @@ class EagleProposer(Proposer):
         for kv_cache_group_id, kv_cache_group_spec in enumerate(
                 self.runner.kv_cache_config.kv_cache_groups):
             common_attn_metadata = AscendCommonAttentionMetadata(
-                query_start_loc=self.runner.query_start_loc[:num_reqs + 1],
-                query_start_loc_cpu=self.runner.query_start_loc_cpu[:num_reqs +
+                query_start_loc=self.runner.query_start_loc.gpu[:num_reqs + 1],
+                query_start_loc_cpu=self.runner.query_start_loc.cpu[:num_reqs +
                                                                     1],
-                seq_lens_cpu=self.runner.seq_lens_cpu,
+                seq_lens_cpu=self.runner.seq_lens.cpu,
                 num_reqs=num_reqs,
                 max_query_len=max_num_scheduled_tokens,
                 num_actual_tokens=total_num_scheduled_tokens,
@@ -351,8 +352,8 @@ class EagleProposer(Proposer):
             block_table_tensor=self.runner.input_batch.block_table[0].
             get_device_tensor(),
             slot_mapping=self.runner.input_batch.block_table[0].
-            slot_mapping,
-            positions=self.runner.positions,
+            slot_mapping.gpu,
+            positions=self.runner.positions.gpu,
             attn_mask=self.runner.attn_mask,
             spec_attn_mask=self.runner.spec_attn_mask,
             attn_state=self.runner.attn_state,
```
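Note that the hunks above depend on `.np` and `.cpu` sharing storage: the runner fills the NumPy view in place (e.g. `np.add(..., out=....np[:n])`) and then launches the async copy from the same memory via `.cpu`. A quick self-contained check of that pattern with the hypothetical wrapper sketched earlier (`device="cpu"` stands in for `"npu"` so it runs without Ascend hardware):

```python
import numpy as np
import torch

# Hypothetical wrapper from the sketch above; "cpu" stands in for "npu"
# so the snippet runs without torch_npu installed.
buf = CpuGpuBuffer(8, dtype=torch.int64, device="cpu")

# Host-side fill through the NumPy view; buf.np aliases buf.cpu's storage,
# so no extra copy is needed before staging.
np.add(np.arange(4), 100, out=buf.np[:4])
assert buf.cpu[:4].tolist() == [100, 101, 102, 103]

# Async host-to-device stage, mirroring the "Copy the tensors to the NPU"
# hunk (a no-op device change here, but the same call shape).
buf.gpu[:4].copy_(buf.cpu[:4], non_blocking=True)
assert buf.gpu[:4].tolist() == [100, 101, 102, 103]
```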