Add qwen2.5 vl multimodal feature for vllm-ascend v1 (#736)
### What this PR does / why we need it? The current vllm-ascend is not support the multimodal model in vllm-ascend v1 yet. So I change the `model_runner_v1.py` file with using MRoPE feature and so on to support this feature. It currently still not perfect since the Ascend operator is not support the `window/full attn` to reduce Memcpy operations, so it would out of memory if the input embedding is too large, so We can't use `self._profile_multimodal()` for profile since it use a big dummy input (i.e. images) as the multimodal input. Fixes: https://github.com/vllm-project/vllm-ascend/issues/514 ### Does this PR introduce _any_ user-facing change? No, this feature not need change the user-facing ### How was this patch tested? I test this offline using my machine 910B3 and my own fork, and it works well. --------- Signed-off-by: cty <ctynb@qq.com>
This commit is contained in:
@@ -60,8 +60,6 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
|
@pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
|
||||||
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1",
|
|
||||||
reason="qwen2.5_vl is not supported on v1")
|
|
||||||
def test_multimodal(model, prompt_template, vllm_runner):
|
def test_multimodal(model, prompt_template, vllm_runner):
|
||||||
image = ImageAsset("cherry_blossom") \
|
image = ImageAsset("cherry_blossom") \
|
||||||
.pil_image.convert("RGB")
|
.pil_image.convert("RGB")
|
||||||
|
|||||||
@@ -42,8 +42,11 @@ from vllm.forward_context import set_forward_context
|
|||||||
from vllm.inputs import INPUT_REGISTRY
|
from vllm.inputs import INPUT_REGISTRY
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||||
|
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
|
||||||
from vllm.model_executor.model_loader import get_model
|
from vllm.model_executor.model_loader import get_model
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||||
|
from vllm.multimodal.utils import group_mm_inputs_by_modality
|
||||||
from vllm.sampling_params import SamplingType
|
from vllm.sampling_params import SamplingType
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
|
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
|
||||||
@@ -61,6 +64,9 @@ from vllm.v1.spec_decode.utils import is_spec_decode_supported
|
|||||||
from vllm.v1.utils import bind_kv_cache
|
from vllm.v1.utils import bind_kv_cache
|
||||||
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
|
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
|
||||||
from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
|
from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
|
||||||
|
from vllm.v1.worker.utils import (gather_mm_placeholders,
|
||||||
|
sanity_check_mm_encoder_outputs,
|
||||||
|
scatter_mm_placeholders)
|
||||||
|
|
||||||
from vllm_ascend.ascend_config import get_ascend_config
|
from vllm_ascend.ascend_config import get_ascend_config
|
||||||
from vllm_ascend.attention.attention import AttentionMaskBuilder
|
from vllm_ascend.attention.attention import AttentionMaskBuilder
|
||||||
@@ -362,6 +368,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
# Remove finished requests from the cached states.
|
# Remove finished requests from the cached states.
|
||||||
for req_id in scheduler_output.finished_req_ids:
|
for req_id in scheduler_output.finished_req_ids:
|
||||||
self.requests.pop(req_id, None)
|
self.requests.pop(req_id, None)
|
||||||
|
self.encoder_cache.pop(req_id, None)
|
||||||
# Remove the finished requests from the persistent batch.
|
# Remove the finished requests from the persistent batch.
|
||||||
# NOTE(woosuk): There could be an edge case where finished_req_ids and
|
# NOTE(woosuk): There could be an edge case where finished_req_ids and
|
||||||
# scheduled_req_ids overlap. This happens when a request is aborted and
|
# scheduled_req_ids overlap. This happens when a request is aborted and
|
||||||
@@ -374,6 +381,14 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
if req_index is not None:
|
if req_index is not None:
|
||||||
removed_req_indices.append(req_index)
|
removed_req_indices.append(req_index)
|
||||||
|
|
||||||
|
# Free the cached encoder outputs.
|
||||||
|
for req_id, input_id in scheduler_output.free_encoder_input_ids:
|
||||||
|
encoder_outputs = self.encoder_cache.get(req_id)
|
||||||
|
if encoder_outputs is not None:
|
||||||
|
encoder_outputs.pop(input_id, None)
|
||||||
|
if not encoder_outputs:
|
||||||
|
self.encoder_cache.pop(req_id, None)
|
||||||
|
|
||||||
# Remove the unscheduled requests from the persistent batch.
|
# Remove the unscheduled requests from the persistent batch.
|
||||||
# NOTE(woosuk): The unscheduled requests are either preempted requests
|
# NOTE(woosuk): The unscheduled requests are either preempted requests
|
||||||
# or running requests that are not scheduled in this step. We remove
|
# or running requests that are not scheduled in this step. We remove
|
||||||
@@ -415,6 +430,43 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
lora_request=new_req_data.lora_request,
|
lora_request=new_req_data.lora_request,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
|
||||||
|
if self.uses_mrope:
|
||||||
|
image_grid_thw = []
|
||||||
|
video_grid_thw = []
|
||||||
|
second_per_grid_ts = []
|
||||||
|
audio_feature_lengths = []
|
||||||
|
use_audio_in_video = False
|
||||||
|
for mm_input in self.requests[req_id].mm_inputs:
|
||||||
|
if mm_input.get("image_grid_thw") is not None:
|
||||||
|
image_grid_thw.extend(
|
||||||
|
mm_input["image_grid_thw"].tolist())
|
||||||
|
if mm_input.get("video_grid_thw") is not None:
|
||||||
|
video_grid_thw.extend(
|
||||||
|
mm_input["video_grid_thw"].tolist())
|
||||||
|
if mm_input.get("second_per_grid_ts") is not None:
|
||||||
|
second_per_grid_ts.extend(
|
||||||
|
mm_input["second_per_grid_ts"])
|
||||||
|
if mm_input.get("audio_feature_lengths") is not None:
|
||||||
|
audio_feature_lengths.extend(
|
||||||
|
mm_input["audio_feature_lengths"])
|
||||||
|
if mm_input.get("use_audio_in_video") is True:
|
||||||
|
use_audio_in_video = True
|
||||||
|
|
||||||
|
hf_config = self.model_config.hf_config
|
||||||
|
|
||||||
|
self.requests[req_id].mrope_positions, \
|
||||||
|
self.requests[req_id].mrope_position_delta = \
|
||||||
|
MRotaryEmbedding.get_input_positions_tensor(
|
||||||
|
self.requests[req_id].prompt_token_ids,
|
||||||
|
hf_config=hf_config,
|
||||||
|
image_grid_thw=image_grid_thw,
|
||||||
|
video_grid_thw=video_grid_thw,
|
||||||
|
second_per_grid_ts=second_per_grid_ts,
|
||||||
|
audio_feature_lengths=audio_feature_lengths,
|
||||||
|
use_audio_in_video=use_audio_in_video,
|
||||||
|
)
|
||||||
|
|
||||||
req_ids_to_add.append(req_id)
|
req_ids_to_add.append(req_id)
|
||||||
|
|
||||||
# Update the states of the running/resumed requests.
|
# Update the states of the running/resumed requests.
|
||||||
@@ -535,6 +587,166 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
|
||||||
|
mrope_pos_ptr = 0
|
||||||
|
for index, req_id in enumerate(self.input_batch.req_ids):
|
||||||
|
req = self.requests[req_id]
|
||||||
|
assert req.mrope_positions is not None
|
||||||
|
|
||||||
|
num_computed_tokens = \
|
||||||
|
self.input_batch.num_computed_tokens_cpu[index]
|
||||||
|
num_scheduled_tokens = \
|
||||||
|
scheduler_output.num_scheduled_tokens[req_id]
|
||||||
|
num_prompt_tokens = len(req.prompt_token_ids)
|
||||||
|
|
||||||
|
if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
|
||||||
|
prompt_part_len = max(0,
|
||||||
|
num_prompt_tokens - num_computed_tokens)
|
||||||
|
completion_part_len = max(
|
||||||
|
0, num_scheduled_tokens - prompt_part_len)
|
||||||
|
else:
|
||||||
|
prompt_part_len = num_scheduled_tokens
|
||||||
|
completion_part_len = 0
|
||||||
|
|
||||||
|
assert num_scheduled_tokens == prompt_part_len + completion_part_len
|
||||||
|
|
||||||
|
if prompt_part_len > 0:
|
||||||
|
# prompt's mrope_positions are pre-computed
|
||||||
|
dst_start = mrope_pos_ptr
|
||||||
|
dst_end = mrope_pos_ptr + prompt_part_len
|
||||||
|
src_start = num_computed_tokens
|
||||||
|
src_end = num_computed_tokens + prompt_part_len
|
||||||
|
|
||||||
|
self.mrope_positions_cpu[:, dst_start:dst_end] = \
|
||||||
|
req.mrope_positions[:,src_start:src_end]
|
||||||
|
|
||||||
|
mrope_pos_ptr += prompt_part_len
|
||||||
|
|
||||||
|
if completion_part_len > 0:
|
||||||
|
# compute completion's mrope_positions on-the-fly
|
||||||
|
dst_start = mrope_pos_ptr
|
||||||
|
dst_end = mrope_pos_ptr + completion_part_len
|
||||||
|
|
||||||
|
self.mrope_positions_cpu[:, dst_start:dst_end] = \
|
||||||
|
MRotaryEmbedding.get_next_input_positions_tensor(
|
||||||
|
req.mrope_position_delta,
|
||||||
|
context_len=num_computed_tokens +
|
||||||
|
prompt_part_len,
|
||||||
|
seq_len=num_computed_tokens +
|
||||||
|
prompt_part_len +
|
||||||
|
completion_part_len,
|
||||||
|
)
|
||||||
|
|
||||||
|
mrope_pos_ptr += completion_part_len
|
||||||
|
|
||||||
|
def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
|
||||||
|
scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
|
||||||
|
if not scheduled_encoder_inputs:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Batch the multi-modal inputs.
|
||||||
|
mm_inputs = list[MultiModalKwargs]()
|
||||||
|
req_ids_pos = list[tuple[str, int, PlaceholderRange]]()
|
||||||
|
for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
|
||||||
|
req_state = self.requests[req_id]
|
||||||
|
|
||||||
|
for mm_input_id in encoder_input_ids:
|
||||||
|
mm_inputs.append(req_state.mm_inputs[mm_input_id])
|
||||||
|
req_ids_pos.append(
|
||||||
|
(req_id, mm_input_id, req_state.mm_positions[mm_input_id]))
|
||||||
|
|
||||||
|
# Batch mm inputs as much as we can: if a request in the batch has
|
||||||
|
# multiple modalities or a different modality than the previous one,
|
||||||
|
# we process it separately to preserve item order.
|
||||||
|
# FIXME(ywang96): This is a hacky way to deal with multiple modalities
|
||||||
|
# in the same batch while still being able to benefit from batching
|
||||||
|
# multimodal inputs. The proper solution should be reordering the
|
||||||
|
# encoder outputs.
|
||||||
|
grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs)
|
||||||
|
|
||||||
|
encoder_outputs = []
|
||||||
|
for grouped_mm_inputs in grouped_mm_inputs_list:
|
||||||
|
batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
|
||||||
|
batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
|
||||||
|
device=self.device)
|
||||||
|
|
||||||
|
# Run the encoder.
|
||||||
|
# `curr_group_outputs` is either of the following:
|
||||||
|
# 1. A tensor of shape (num_items, feature_size, hidden_size)
|
||||||
|
# in case feature_size is fixed across all multimodal items.
|
||||||
|
# 2. A list or tuple (length: num_items) of tensors, each of shape
|
||||||
|
# (feature_size, hidden_size) in case the feature size is dynamic
|
||||||
|
# depending on the input multimodal items.
|
||||||
|
curr_group_outputs = self.model.get_multimodal_embeddings(
|
||||||
|
**batched_mm_inputs)
|
||||||
|
|
||||||
|
sanity_check_mm_encoder_outputs(
|
||||||
|
curr_group_outputs,
|
||||||
|
expected_num_items=len(grouped_mm_inputs),
|
||||||
|
)
|
||||||
|
|
||||||
|
for output in curr_group_outputs:
|
||||||
|
encoder_outputs.append(output)
|
||||||
|
|
||||||
|
# Cache the encoder outputs.
|
||||||
|
for (req_id, input_id, pos_info), output in zip(
|
||||||
|
req_ids_pos,
|
||||||
|
encoder_outputs,
|
||||||
|
):
|
||||||
|
if req_id not in self.encoder_cache:
|
||||||
|
self.encoder_cache[req_id] = {}
|
||||||
|
|
||||||
|
self.encoder_cache[req_id][input_id] = scatter_mm_placeholders(
|
||||||
|
output,
|
||||||
|
is_embed=pos_info.is_embed,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _gather_mm_embeddings(
|
||||||
|
self,
|
||||||
|
scheduler_output: "SchedulerOutput",
|
||||||
|
) -> list[torch.Tensor]:
|
||||||
|
mm_embeds: list[torch.Tensor] = []
|
||||||
|
for req_id in self.input_batch.req_ids:
|
||||||
|
num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
|
||||||
|
req_id]
|
||||||
|
req_state = self.requests[req_id]
|
||||||
|
num_computed_tokens = req_state.num_computed_tokens
|
||||||
|
mm_positions = req_state.mm_positions
|
||||||
|
for i, pos_info in enumerate(mm_positions):
|
||||||
|
start_pos = pos_info.offset
|
||||||
|
num_encoder_tokens = pos_info.length
|
||||||
|
|
||||||
|
# The encoder output is needed if the two ranges overlap:
|
||||||
|
# [num_computed_tokens,
|
||||||
|
# num_computed_tokens + num_scheduled_tokens) and
|
||||||
|
# [start_pos, start_pos + num_encoder_tokens)
|
||||||
|
if start_pos >= num_computed_tokens + num_scheduled_tokens:
|
||||||
|
# The encoder output is not needed in this step.
|
||||||
|
break
|
||||||
|
if start_pos + num_encoder_tokens <= num_computed_tokens:
|
||||||
|
# The encoder output is already processed and stored
|
||||||
|
# in the decoder's KV cache.
|
||||||
|
continue
|
||||||
|
|
||||||
|
start_idx = max(num_computed_tokens - start_pos, 0)
|
||||||
|
end_idx = min(
|
||||||
|
num_computed_tokens - start_pos + num_scheduled_tokens,
|
||||||
|
num_encoder_tokens)
|
||||||
|
assert start_idx < end_idx
|
||||||
|
assert req_id in self.encoder_cache
|
||||||
|
assert i in self.encoder_cache[req_id]
|
||||||
|
encoder_output = self.encoder_cache[req_id][i]
|
||||||
|
|
||||||
|
if (is_embed := pos_info.is_embed) is not None:
|
||||||
|
is_embed = is_embed[start_idx:end_idx]
|
||||||
|
|
||||||
|
mm_embeds_item = gather_mm_placeholders(
|
||||||
|
encoder_output[start_idx:end_idx],
|
||||||
|
is_embed=is_embed,
|
||||||
|
)
|
||||||
|
mm_embeds.append(mm_embeds_item)
|
||||||
|
return mm_embeds
|
||||||
|
|
||||||
def _process_reqs(
|
def _process_reqs(
|
||||||
self,
|
self,
|
||||||
scheduler_output: "SchedulerOutput",
|
scheduler_output: "SchedulerOutput",
|
||||||
@@ -594,6 +806,17 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
arange,
|
arange,
|
||||||
out=positions_np)
|
out=positions_np)
|
||||||
|
|
||||||
|
# Calculate M-RoPE positions.
|
||||||
|
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
|
||||||
|
if self.uses_mrope:
|
||||||
|
self._calc_mrope_positions(scheduler_output)
|
||||||
|
|
||||||
|
if self.uses_mrope:
|
||||||
|
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
|
||||||
|
self.mrope_positions[:, :total_num_scheduled_tokens].copy_(
|
||||||
|
self.mrope_positions_cpu[:, :total_num_scheduled_tokens],
|
||||||
|
non_blocking=True)
|
||||||
|
|
||||||
self.positions[:total_num_scheduled_tokens].copy_(
|
self.positions[:total_num_scheduled_tokens].copy_(
|
||||||
self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True)
|
self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True)
|
||||||
positions = self.positions[:num_input_tokens]
|
positions = self.positions[:num_input_tokens]
|
||||||
@@ -706,6 +929,43 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
input_ids = self.input_ids[:padded_batch_size]
|
input_ids = self.input_ids[:padded_batch_size]
|
||||||
positions = self.positions[:padded_batch_size]
|
positions = self.positions[:padded_batch_size]
|
||||||
|
|
||||||
|
# prepare the MRoPE for mllm if using multimodal
|
||||||
|
num_input_tokens = total_num_scheduled_tokens
|
||||||
|
# _prepare_inputs may reorder the batch, so we must gather multi
|
||||||
|
# modal outputs after that to ensure the correct order
|
||||||
|
if self.is_multimodal_model:
|
||||||
|
# Run the multimodal encoder if any.
|
||||||
|
self._execute_mm_encoder(scheduler_output)
|
||||||
|
mm_embeds = self._gather_mm_embeddings(scheduler_output)
|
||||||
|
else:
|
||||||
|
mm_embeds = []
|
||||||
|
|
||||||
|
if self.is_multimodal_model:
|
||||||
|
# NOTE(woosuk): To unify token ids and soft tokens (vision
|
||||||
|
# embeddings), we always use embeddings (rather than token ids)
|
||||||
|
# as input to the multimodal model, even when the input is text.
|
||||||
|
input_ids = self.input_ids[:num_input_tokens]
|
||||||
|
if mm_embeds:
|
||||||
|
inputs_embeds = self.model.get_input_embeddings(
|
||||||
|
input_ids, mm_embeds)
|
||||||
|
else:
|
||||||
|
inputs_embeds = self.model.get_input_embeddings(input_ids)
|
||||||
|
# TODO(woosuk): Avoid the copy. Optimize.
|
||||||
|
self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds)
|
||||||
|
inputs_embeds = self.inputs_embeds[:num_input_tokens]
|
||||||
|
input_ids = None
|
||||||
|
else:
|
||||||
|
# For text-only models, we use token ids as input.
|
||||||
|
# While it is possible to use embeddings as input just like the
|
||||||
|
# multimodal models, it is not desirable for performance since
|
||||||
|
# then the embedding layer is not included in the CUDA graph.
|
||||||
|
input_ids = self.input_ids[:num_input_tokens]
|
||||||
|
inputs_embeds = None
|
||||||
|
if self.uses_mrope:
|
||||||
|
positions = self.mrope_positions[:, :num_input_tokens]
|
||||||
|
else:
|
||||||
|
positions = self.positions[:num_input_tokens]
|
||||||
|
|
||||||
# Run forward pass
|
# Run forward pass
|
||||||
with set_forward_context(attn_metadata,
|
with set_forward_context(attn_metadata,
|
||||||
self.vllm_config,
|
self.vllm_config,
|
||||||
@@ -722,7 +982,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
input_ids=input_ids,
|
input_ids=input_ids,
|
||||||
positions=positions,
|
positions=positions,
|
||||||
intermediate_tensors=intermediate_tensors,
|
intermediate_tensors=intermediate_tensors,
|
||||||
inputs_embeds=None,
|
inputs_embeds=inputs_embeds,
|
||||||
**model_kwargs,
|
**model_kwargs,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@@ -731,7 +991,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
input_ids=input_ids,
|
input_ids=input_ids,
|
||||||
positions=positions,
|
positions=positions,
|
||||||
intermediate_tensors=intermediate_tensors,
|
intermediate_tensors=intermediate_tensors,
|
||||||
inputs_embeds=None,
|
inputs_embeds=inputs_embeds,
|
||||||
**model_kwargs,
|
**model_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1214,8 +1474,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
def profile_run(self) -> None:
|
def profile_run(self) -> None:
|
||||||
# Profile with multimodal encoder & encoder cache.
|
# FIXME Profile with multimodal encoder & encoder cache.
|
||||||
self._profile_multimodal()
|
# current _profile_multimodal() using PyTorch SDPA backend method not
|
||||||
|
# support for window/full attn to reduce Memcpy operations, so will cause
|
||||||
|
# Out Of Memory problem, so we currently don't use self._profile_multimodal()
|
||||||
|
# self._profile_multimodal()
|
||||||
|
|
||||||
# For profile, have maximum num_reqs and that collectively have
|
# For profile, have maximum num_reqs and that collectively have
|
||||||
# maximum num_tokens.
|
# maximum num_tokens.
|
||||||
|
|||||||
Reference in New Issue
Block a user