[Core] Cherry pick from 0.7.1 to keep the main code newest (#127)
Cherry pick from 0.7.1 to keep the main code newest Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -53,7 +53,7 @@ from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
|
||||
from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, flatten_2d_lists,
|
||||
is_pin_memory_available, make_tensor_with_pad)
|
||||
is_pin_memory_available)
|
||||
from vllm.worker.model_runner_base import (
|
||||
ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
|
||||
_add_attn_metadata_broadcastable_dict,
|
||||
@@ -511,50 +511,21 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
|
||||
for data in self.inter_data_list
|
||||
}
|
||||
|
||||
batch_size = len(input_tokens)
|
||||
|
||||
if self.inter_data_list[0].is_prompt:
|
||||
input_tokens_tensor = make_tensor_with_pad(
|
||||
input_tokens, 0, dtype=torch.int, device=self.runner.device)
|
||||
input_tokens_tensor = torch.flatten(input_tokens_tensor)
|
||||
if mrope_input_positions is not None:
|
||||
mrope_input_positions_tensor = make_tensor_with_pad(
|
||||
mrope_input_positions,
|
||||
0,
|
||||
dtype=torch.int,
|
||||
device=self.runner.device)
|
||||
input_positions_tensor = torch.tensor(
|
||||
mrope_input_positions_tensor,
|
||||
dtype=torch.long,
|
||||
device=self.runner.device)
|
||||
else:
|
||||
input_positions_tensor = make_tensor_with_pad(
|
||||
input_positions,
|
||||
0,
|
||||
dtype=torch.int,
|
||||
device=self.runner.device)
|
||||
input_positions_tensor = torch.flatten(input_positions_tensor)
|
||||
|
||||
max_seq_len = max(seq_lens)
|
||||
seq_lens = len(seq_lens) * [max_seq_len]
|
||||
input_tokens_tensor = torch.tensor(flatten_2d_lists(input_tokens),
|
||||
dtype=torch.long,
|
||||
device=self.runner.device)
|
||||
if mrope_input_positions is not None:
|
||||
input_positions_tensor = torch.tensor(mrope_input_positions,
|
||||
dtype=torch.long,
|
||||
device=self.runner.device)
|
||||
else:
|
||||
input_tokens_tensor = torch.tensor(flatten_2d_lists(input_tokens),
|
||||
dtype=torch.long,
|
||||
device=self.runner.device)
|
||||
if mrope_input_positions is not None:
|
||||
input_positions_tensor = torch.tensor(
|
||||
mrope_input_positions,
|
||||
dtype=torch.long,
|
||||
device=self.runner.device)
|
||||
else:
|
||||
input_positions_tensor = torch.tensor(
|
||||
flatten_2d_lists(input_positions),
|
||||
dtype=torch.long,
|
||||
device=self.runner.device)
|
||||
input_positions_tensor = torch.tensor(
|
||||
flatten_2d_lists(input_positions),
|
||||
dtype=torch.long,
|
||||
device=self.runner.device)
|
||||
|
||||
# Attention metadata.
|
||||
attn_metadata = self.attn_metadata_builder.build(
|
||||
seq_lens, query_lens, -1, batch_size)
|
||||
attn_metadata = self.attn_metadata_builder.build(seq_lens, query_lens)
|
||||
|
||||
# Multi-modal data.
|
||||
multi_modal_kwargs_list = [
|
||||
@@ -749,10 +720,14 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
|
||||
mrope_input_positions, mrope_position_delta = \
|
||||
MRotaryEmbedding.get_input_positions(
|
||||
token_ids,
|
||||
hf_config,
|
||||
image_grid_thw=image_grid_thw,
|
||||
video_grid_thw=video_grid_thw,
|
||||
second_per_grid_ts=None,
|
||||
image_token_id=hf_config.image_token_id,
|
||||
video_token_id=hf_config.video_token_id,
|
||||
vision_start_token_id=hf_config.vision_start_token_id,
|
||||
vision_end_token_id=hf_config.vision_end_token_id,
|
||||
spatial_merge_size=hf_config.vision_config.
|
||||
spatial_merge_size,
|
||||
context_len=inter_data.context_lens[seq_idx],
|
||||
seq_len=inter_data.seq_lens[seq_idx],
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user