[Bugfix] fix eagle proposer (#4971)
### What this PR does / why we need it?
After https://github.com/vllm-project/vllm-ascend/pull/4764, many
tensors created by `make_buffer` were renamed, e.g. `input_ids` ->
`input_ids.gpu`; this PR updates the eagle proposer to use the new names.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
This commit is contained in:
@@ -169,7 +169,7 @@ class EagleProposer(Proposer):
|
|||||||
eagle_attn_metadata = attn_metadata[self.attn_layer_name]
|
eagle_attn_metadata = attn_metadata[self.attn_layer_name]
|
||||||
if spec_decode_metadata is None:
|
if spec_decode_metadata is None:
|
||||||
# input_ids can be None for multimodal models.
|
# input_ids can be None for multimodal models.
|
||||||
target_token_ids = self.runner.input_ids[:num_scheduled_tokens]
|
target_token_ids = self.runner.input_ids.gpu[:num_scheduled_tokens]
|
||||||
target_positions = positions[:num_scheduled_tokens]
|
target_positions = positions[:num_scheduled_tokens]
|
||||||
if self.name == SpecDcodeType.EAGLE3:
|
if self.name == SpecDcodeType.EAGLE3:
|
||||||
target_hidden_states = torch.cat(
|
target_hidden_states = torch.cat(
|
||||||
@@ -192,7 +192,7 @@ class EagleProposer(Proposer):
|
|||||||
)
|
)
|
||||||
cu_num_tokens, token_indices =\
|
cu_num_tokens, token_indices =\
|
||||||
self._prepare_inputs(eagle_attn_metadata, num_rejected_tokens)
|
self._prepare_inputs(eagle_attn_metadata, num_rejected_tokens)
|
||||||
target_token_ids = self.runner.input_ids[token_indices]
|
target_token_ids = self.runner.input_ids.gpu[token_indices]
|
||||||
target_positions = positions[token_indices]
|
target_positions = positions[token_indices]
|
||||||
if self.name == SpecDcodeType.EAGLE3:
|
if self.name == SpecDcodeType.EAGLE3:
|
||||||
target_hidden_states = torch.cat(
|
target_hidden_states = torch.cat(
|
||||||
@@ -245,7 +245,7 @@ class EagleProposer(Proposer):
|
|||||||
num_scheduled_tokens)
|
num_scheduled_tokens)
|
||||||
|
|
||||||
# Get positions.
|
# Get positions.
|
||||||
positions_np = self.runner.positions_np[:total_num_scheduled_tokens]
|
positions_np = self.runner.positions.np[:total_num_scheduled_tokens]
|
||||||
np.add(self.runner.input_batch.num_computed_tokens_cpu[req_indices],
|
np.add(self.runner.input_batch.num_computed_tokens_cpu[req_indices],
|
||||||
arange,
|
arange,
|
||||||
out=positions_np)
|
out=positions_np)
|
||||||
@@ -270,7 +270,7 @@ class EagleProposer(Proposer):
|
|||||||
self.runner.input_batch.token_ids_cpu_tensor.flatten(),
|
self.runner.input_batch.token_ids_cpu_tensor.flatten(),
|
||||||
0,
|
0,
|
||||||
torch.from_numpy(token_indices),
|
torch.from_numpy(token_indices),
|
||||||
out=self.runner.input_ids_cpu[:total_num_scheduled_tokens])
|
out=self.runner.input_ids.cpu[:total_num_scheduled_tokens])
|
||||||
|
|
||||||
# Prepare the attention metadata for each KV cache group and make layers
|
# Prepare the attention metadata for each KV cache group and make layers
|
||||||
# in the same group share the same metadata.
|
# in the same group share the same metadata.
|
||||||
@@ -299,40 +299,41 @@ class EagleProposer(Proposer):
|
|||||||
np.add(
|
np.add(
|
||||||
block_numbers * block_size,
|
block_numbers * block_size,
|
||||||
block_offsets,
|
block_offsets,
|
||||||
out=block_table.slot_mapping_np[:total_num_scheduled_tokens])
|
out=block_table.slot_mapping.np[:total_num_scheduled_tokens])
|
||||||
|
|
||||||
# Prepare the attention metadata.
|
# Prepare the attention metadata.
|
||||||
self.runner.query_start_loc_np[0] = 0
|
self.runner.query_start_loc.np[0] = 0
|
||||||
self.runner.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
|
self.runner.query_start_loc.np[1:num_reqs + 1] = cu_num_tokens
|
||||||
|
|
||||||
self.runner.seq_lens_np[:num_reqs] = (
|
self.runner.seq_lens.np[:num_reqs] = (
|
||||||
self.runner.input_batch.num_computed_tokens_cpu[:num_reqs] +
|
self.runner.input_batch.num_computed_tokens_cpu[:num_reqs] +
|
||||||
num_scheduled_tokens)
|
num_scheduled_tokens)
|
||||||
|
|
||||||
# Copy the tensors to the NPU.
|
# Copy the tensors to the NPU.
|
||||||
self.runner.input_ids[:total_num_scheduled_tokens].copy_(
|
self.runner.input_ids.gpu[:total_num_scheduled_tokens].copy_(
|
||||||
self.runner.input_ids_cpu[:total_num_scheduled_tokens],
|
self.runner.input_ids.cpu[:total_num_scheduled_tokens],
|
||||||
non_blocking=True)
|
non_blocking=True)
|
||||||
if self.runner.uses_mrope:
|
if self.runner.uses_mrope:
|
||||||
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
|
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
|
||||||
self.runner.mrope_positions[:, :total_num_scheduled_tokens].copy_(
|
self.runner.mrope_positions.gpu[:, :total_num_scheduled_tokens] \
|
||||||
self.runner.
|
.copy_(
|
||||||
mrope_positions_cpu[:, :total_num_scheduled_tokens],
|
self.runner.
|
||||||
non_blocking=True)
|
mrope_positions.cpu[:, :total_num_scheduled_tokens],
|
||||||
|
non_blocking=True)
|
||||||
else:
|
else:
|
||||||
# Common case (1D positions)
|
# Common case (1D positions)
|
||||||
self.runner.positions[:total_num_scheduled_tokens].copy_(
|
self.runner.positions.gpu[:total_num_scheduled_tokens].copy_(
|
||||||
self.runner.positions_cpu[:total_num_scheduled_tokens],
|
self.runner.positions.cpu[:total_num_scheduled_tokens],
|
||||||
non_blocking=True)
|
non_blocking=True)
|
||||||
|
|
||||||
self.runner.query_start_loc[:num_reqs + 1].copy_(
|
self.runner.query_start_loc.gpu[:num_reqs + 1].copy_(
|
||||||
self.runner.query_start_loc_cpu[:num_reqs + 1], non_blocking=True)
|
self.runner.query_start_loc.cpu[:num_reqs + 1], non_blocking=True)
|
||||||
self.runner.seq_lens[:num_reqs].copy_(
|
self.runner.seq_lens.gpu[:num_reqs].copy_(
|
||||||
self.runner.seq_lens_cpu[:num_reqs], non_blocking=True)
|
self.runner.seq_lens.cpu[:num_reqs], non_blocking=True)
|
||||||
|
|
||||||
# Fill unused with -1. Needed for reshape_and_cache
|
# Fill unused with -1. Needed for reshape_and_cache
|
||||||
self.runner.seq_lens[num_reqs:].fill_(0)
|
self.runner.seq_lens.gpu[num_reqs:].fill_(0)
|
||||||
self.runner.query_start_loc[num_reqs + 1:].fill_(-1)
|
self.runner.query_start_loc.gpu[num_reqs + 1:].fill_(-1)
|
||||||
|
|
||||||
attn_metadata = {}
|
attn_metadata = {}
|
||||||
# Prepare the attention metadata for each KV cache group and make layers
|
# Prepare the attention metadata for each KV cache group and make layers
|
||||||
@@ -340,10 +341,10 @@ class EagleProposer(Proposer):
|
|||||||
for kv_cache_group_id, kv_cache_group_spec in enumerate(
|
for kv_cache_group_id, kv_cache_group_spec in enumerate(
|
||||||
self.runner.kv_cache_config.kv_cache_groups):
|
self.runner.kv_cache_config.kv_cache_groups):
|
||||||
common_attn_metadata = AscendCommonAttentionMetadata(
|
common_attn_metadata = AscendCommonAttentionMetadata(
|
||||||
query_start_loc=self.runner.query_start_loc[:num_reqs + 1],
|
query_start_loc=self.runner.query_start_loc.gpu[:num_reqs + 1],
|
||||||
query_start_loc_cpu=self.runner.query_start_loc_cpu[:num_reqs +
|
query_start_loc_cpu=self.runner.query_start_loc.cpu[:num_reqs +
|
||||||
1],
|
1],
|
||||||
seq_lens_cpu=self.runner.seq_lens_cpu,
|
seq_lens_cpu=self.runner.seq_lens.cpu,
|
||||||
num_reqs=num_reqs,
|
num_reqs=num_reqs,
|
||||||
max_query_len=max_num_scheduled_tokens,
|
max_query_len=max_num_scheduled_tokens,
|
||||||
num_actual_tokens=total_num_scheduled_tokens,
|
num_actual_tokens=total_num_scheduled_tokens,
|
||||||
@@ -351,8 +352,8 @@ class EagleProposer(Proposer):
|
|||||||
block_table_tensor=self.runner.input_batch.block_table[0].
|
block_table_tensor=self.runner.input_batch.block_table[0].
|
||||||
get_device_tensor(),
|
get_device_tensor(),
|
||||||
slot_mapping=self.runner.input_batch.block_table[0].
|
slot_mapping=self.runner.input_batch.block_table[0].
|
||||||
slot_mapping,
|
slot_mapping.gpu,
|
||||||
positions=self.runner.positions,
|
positions=self.runner.positions.gpu,
|
||||||
attn_mask=self.runner.attn_mask,
|
attn_mask=self.runner.attn_mask,
|
||||||
spec_attn_mask=self.runner.spec_attn_mask,
|
spec_attn_mask=self.runner.spec_attn_mask,
|
||||||
attn_state=self.runner.attn_state,
|
attn_state=self.runner.attn_state,
|
||||||
|
|||||||
Reference in New Issue
Block a user