[ModelRunner][MultiModal] Remove legacy input mapper/processor from V0 (#951)

### What this PR does / why we need it?
Remove the legacy multi-modal input mapper/processor from the V0 code path. With vLLM's merged multi-modal processor, multi-modal inputs are already processed before they reach the model runner, so the per-runner input mapper is no longer needed.

See https://github.com/vllm-project/vllm-ascend/issues/673 and
https://github.com/vllm-project/vllm/pull/15686 for more details.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Launch the online service:

```bash
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--max-model-len 32768 \
--max-num-batched-tokens 32768
```

Query the server:

```bash
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
    "model": "Qwen/Qwen2.5-VL-7B-Instruct",
    "messages": [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [
        {"type": "image_url", "image_url": {"url": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png"}},
        {"type": "text", "text": "What is the text in the illustrate?"}
    ]}
    ]
    }'
```
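
Equivalently, the same request can be sent with the OpenAI Python client. A minimal sketch (the `openai` package and the `EMPTY` API-key placeholder are assumptions, not part of the original test):

```python
# Minimal sketch: the same request via vLLM's OpenAI-compatible endpoint.
# Assumes `pip install openai`; the api_key value is a placeholder since the
# server above starts without authentication.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="Qwen/Qwen2.5-VL-7B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png"}},
            # Same prompt as the curl payload above.
            {"type": "text", "text": "What is the text in the illustrate?"},
        ]},
    ],
)
print(response.choices[0].message.content)
```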

Result:

```json
{"id":"chatcmpl-619e70733ed148b3be3a0b6524ee0ef3","object":"chat.completion","created":1748226332,"model":"/home/sss/.cache/modelscope/hub/models/Qwen/Qwen2___5-VL-7B-Instruct","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"The text in the illustration reads \"TONGYI Qwen.\"","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"pro
```

Signed-off-by: shen-shanshan <467638484@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
```diff
@@ -391,7 +391,6 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
         self.sliding_window = self.runner.sliding_window
         self.block_size = self.runner.block_size
         self.enable_lora = self.runner.lora_config is not None
-        self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper
         self.finished_requests_ids = finished_requests_ids
         self.decode_only = True
         self.is_encoder_decoder = self.runner.model_config.is_encoder_decoder
@@ -786,23 +785,15 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
     def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
                                    seq_group_metadata: SequenceGroupMetadata):
         """If multi-modal data is given, add it to the input."""
-        # NOTE: mm_data only includes the subset of multi-modal items that
+        # NOTE: mm_kwargs only includes the subset of multi-modal items that
         # intersect with the current prefill positions.
         positions = inter_data.input_positions[0]
-        mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group(
+        mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group(
             seq_group_metadata,
             range(positions[0], positions[0] + len(positions)))
-        if not mm_data:
+        if not mm_kwargs:
             return
-        if self.runner.mm_registry.has_processor(self.runner.model_config):
-            mm_kwargs = mm_data
-        else:
-            mm_kwargs = self.multi_modal_input_mapper(
-                mm_data,
-                seq_group_metadata.mm_processor_kwargs,
-            )
         inter_data.multi_modal_kwargs = mm_kwargs
         inter_data.multi_modal_placeholder_maps = placeholder_maps
@@ -918,9 +909,6 @@ class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]):
         # Multi-modal data support
         self.input_registry = input_registry
         self.mm_registry = mm_registry
-        self.multi_modal_input_mapper = mm_registry \
-            .create_input_mapper(model_config)
-
         self.mm_registry.init_mm_limits_per_prompt(self.model_config)
         # Lazy initialization
         self.model: nn.Module  # Set after load_model
@@ -1116,8 +1104,8 @@ class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]):
             dummy_data = self.input_registry \
                 .dummy_data_for_profiling(self.model_config,
-                                          seq_len,
-                                          self.mm_registry)
+                                          seq_len,
+                                          self.mm_registry)
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),
```
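
For readability, here is a sketch of `_compute_multi_modal_input` after this change, assembled from the hunks above (annotations and class context abbreviated; the import path is the one used by vLLM at the time, so verify it against your version):

```python
# Sketch of the simplified method, assembled from the diff above.
from vllm.multimodal import MultiModalPlaceholderMap


def _compute_multi_modal_input(self, inter_data, seq_group_metadata):
    """If multi-modal data is given, add it to the input."""
    # mm_kwargs only includes the subset of multi-modal items that
    # intersect with the current prefill positions.
    positions = inter_data.input_positions[0]
    mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group(
        seq_group_metadata,
        range(positions[0], positions[0] + len(positions)))
    if not mm_kwargs:
        return
    # With the merged multi-modal processor, mm_kwargs is already model-ready,
    # so the legacy input-mapper branch is gone.
    inter_data.multi_modal_kwargs = mm_kwargs
    inter_data.multi_modal_placeholder_maps = placeholder_maps
```

The net effect: the builder and runner no longer construct or call `multi_modal_input_mapper`; processed `MultiModalKwargs` flow straight from `MultiModalPlaceholderMap.from_seq_group` into `inter_data`.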