diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 27a7f717..47dc823d 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -169,7 +169,7 @@ class EagleProposer(Proposer): eagle_attn_metadata = attn_metadata[self.attn_layer_name] if spec_decode_metadata is None: # input_ids can be None for multimodal models. - target_token_ids = self.runner.input_ids[:num_scheduled_tokens] + target_token_ids = self.runner.input_ids.gpu[:num_scheduled_tokens] target_positions = positions[:num_scheduled_tokens] if self.name == SpecDcodeType.EAGLE3: target_hidden_states = torch.cat( @@ -192,7 +192,7 @@ class EagleProposer(Proposer): ) cu_num_tokens, token_indices =\ self._prepare_inputs(eagle_attn_metadata, num_rejected_tokens) - target_token_ids = self.runner.input_ids[token_indices] + target_token_ids = self.runner.input_ids.gpu[token_indices] target_positions = positions[token_indices] if self.name == SpecDcodeType.EAGLE3: target_hidden_states = torch.cat( @@ -245,7 +245,7 @@ class EagleProposer(Proposer): num_scheduled_tokens) # Get positions. - positions_np = self.runner.positions_np[:total_num_scheduled_tokens] + positions_np = self.runner.positions.np[:total_num_scheduled_tokens] np.add(self.runner.input_batch.num_computed_tokens_cpu[req_indices], arange, out=positions_np) @@ -270,7 +270,7 @@ class EagleProposer(Proposer): self.runner.input_batch.token_ids_cpu_tensor.flatten(), 0, torch.from_numpy(token_indices), - out=self.runner.input_ids_cpu[:total_num_scheduled_tokens]) + out=self.runner.input_ids.cpu[:total_num_scheduled_tokens]) # Prepare the attention metadata for each KV cache group and make layers # in the same group share the same metadata. @@ -299,40 +299,41 @@ class EagleProposer(Proposer): np.add( block_numbers * block_size, block_offsets, - out=block_table.slot_mapping_np[:total_num_scheduled_tokens]) + out=block_table.slot_mapping.np[:total_num_scheduled_tokens]) # Prepare the attention metadata. - self.runner.query_start_loc_np[0] = 0 - self.runner.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens + self.runner.query_start_loc.np[0] = 0 + self.runner.query_start_loc.np[1:num_reqs + 1] = cu_num_tokens - self.runner.seq_lens_np[:num_reqs] = ( + self.runner.seq_lens.np[:num_reqs] = ( self.runner.input_batch.num_computed_tokens_cpu[:num_reqs] + num_scheduled_tokens) # Copy the tensors to the NPU. - self.runner.input_ids[:total_num_scheduled_tokens].copy_( - self.runner.input_ids_cpu[:total_num_scheduled_tokens], + self.runner.input_ids.gpu[:total_num_scheduled_tokens].copy_( + self.runner.input_ids.cpu[:total_num_scheduled_tokens], non_blocking=True) if self.runner.uses_mrope: # Only relevant for models using M-RoPE (e.g, Qwen2-VL) - self.runner.mrope_positions[:, :total_num_scheduled_tokens].copy_( - self.runner. - mrope_positions_cpu[:, :total_num_scheduled_tokens], - non_blocking=True) + self.runner.mrope_positions.gpu[:, :total_num_scheduled_tokens] \ + .copy_( + self.runner. + mrope_positions.cpu[:, :total_num_scheduled_tokens], + non_blocking=True) else: # Common case (1D positions) - self.runner.positions[:total_num_scheduled_tokens].copy_( - self.runner.positions_cpu[:total_num_scheduled_tokens], + self.runner.positions.gpu[:total_num_scheduled_tokens].copy_( + self.runner.positions.cpu[:total_num_scheduled_tokens], non_blocking=True) - self.runner.query_start_loc[:num_reqs + 1].copy_( - self.runner.query_start_loc_cpu[:num_reqs + 1], non_blocking=True) - self.runner.seq_lens[:num_reqs].copy_( - self.runner.seq_lens_cpu[:num_reqs], non_blocking=True) + self.runner.query_start_loc.gpu[:num_reqs + 1].copy_( + self.runner.query_start_loc.cpu[:num_reqs + 1], non_blocking=True) + self.runner.seq_lens.gpu[:num_reqs].copy_( + self.runner.seq_lens.cpu[:num_reqs], non_blocking=True) # Fill unused with -1. Needed for reshape_and_cache - self.runner.seq_lens[num_reqs:].fill_(0) - self.runner.query_start_loc[num_reqs + 1:].fill_(-1) + self.runner.seq_lens.gpu[num_reqs:].fill_(0) + self.runner.query_start_loc.gpu[num_reqs + 1:].fill_(-1) attn_metadata = {} # Prepare the attention metadata for each KV cache group and make layers @@ -340,10 +341,10 @@ class EagleProposer(Proposer): for kv_cache_group_id, kv_cache_group_spec in enumerate( self.runner.kv_cache_config.kv_cache_groups): common_attn_metadata = AscendCommonAttentionMetadata( - query_start_loc=self.runner.query_start_loc[:num_reqs + 1], - query_start_loc_cpu=self.runner.query_start_loc_cpu[:num_reqs + + query_start_loc=self.runner.query_start_loc.gpu[:num_reqs + 1], + query_start_loc_cpu=self.runner.query_start_loc.cpu[:num_reqs + 1], - seq_lens_cpu=self.runner.seq_lens_cpu, + seq_lens_cpu=self.runner.seq_lens.cpu, num_reqs=num_reqs, max_query_len=max_num_scheduled_tokens, num_actual_tokens=total_num_scheduled_tokens, @@ -351,8 +352,8 @@ class EagleProposer(Proposer): block_table_tensor=self.runner.input_batch.block_table[0]. get_device_tensor(), slot_mapping=self.runner.input_batch.block_table[0]. - slot_mapping, - positions=self.runner.positions, + slot_mapping.gpu, + positions=self.runner.positions.gpu, attn_mask=self.runner.attn_mask, spec_attn_mask=self.runner.spec_attn_mask, attn_state=self.runner.attn_state,