[Core] Cherry-pick changes from 0.7.1 to keep the main code up to date (#127)

Cherry-pick changes from 0.7.1 to keep the main code up to date.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-02-21 17:07:37 +08:00
committed by GitHub
parent 36991b2052
commit 5f465010de
11 changed files with 1136 additions and 353 deletions

View File

@@ -53,7 +53,7 @@ from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, flatten_2d_lists,
is_pin_memory_available, make_tensor_with_pad)
is_pin_memory_available)
from vllm.worker.model_runner_base import (
ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
_add_attn_metadata_broadcastable_dict,
@@ -511,50 +511,21 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
for data in self.inter_data_list
}
batch_size = len(input_tokens)
if self.inter_data_list[0].is_prompt:
input_tokens_tensor = make_tensor_with_pad(
input_tokens, 0, dtype=torch.int, device=self.runner.device)
input_tokens_tensor = torch.flatten(input_tokens_tensor)
if mrope_input_positions is not None:
mrope_input_positions_tensor = make_tensor_with_pad(
mrope_input_positions,
0,
dtype=torch.int,
device=self.runner.device)
input_positions_tensor = torch.tensor(
mrope_input_positions_tensor,
dtype=torch.long,
device=self.runner.device)
else:
input_positions_tensor = make_tensor_with_pad(
input_positions,
0,
dtype=torch.int,
device=self.runner.device)
input_positions_tensor = torch.flatten(input_positions_tensor)
max_seq_len = max(seq_lens)
seq_lens = len(seq_lens) * [max_seq_len]
input_tokens_tensor = torch.tensor(flatten_2d_lists(input_tokens),
dtype=torch.long,
device=self.runner.device)
if mrope_input_positions is not None:
input_positions_tensor = torch.tensor(mrope_input_positions,
dtype=torch.long,
device=self.runner.device)
else:
input_tokens_tensor = torch.tensor(flatten_2d_lists(input_tokens),
dtype=torch.long,
device=self.runner.device)
if mrope_input_positions is not None:
input_positions_tensor = torch.tensor(
mrope_input_positions,
dtype=torch.long,
device=self.runner.device)
else:
input_positions_tensor = torch.tensor(
flatten_2d_lists(input_positions),
dtype=torch.long,
device=self.runner.device)
input_positions_tensor = torch.tensor(
flatten_2d_lists(input_positions),
dtype=torch.long,
device=self.runner.device)
# Attention metadata.
attn_metadata = self.attn_metadata_builder.build(
seq_lens, query_lens, -1, batch_size)
attn_metadata = self.attn_metadata_builder.build(seq_lens, query_lens)
# Multi-modal data.
multi_modal_kwargs_list = [
@@ -749,10 +720,14 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
mrope_input_positions, mrope_position_delta = \
MRotaryEmbedding.get_input_positions(
token_ids,
hf_config,
image_grid_thw=image_grid_thw,
video_grid_thw=video_grid_thw,
second_per_grid_ts=None,
image_token_id=hf_config.image_token_id,
video_token_id=hf_config.video_token_id,
vision_start_token_id=hf_config.vision_start_token_id,
vision_end_token_id=hf_config.vision_end_token_id,
spatial_merge_size=hf_config.vision_config.
spatial_merge_size,
context_len=inter_data.context_lens[seq_idx],
seq_len=inter_data.seq_lens[seq_idx],
)