74 lines
2.5 KiB
Python
74 lines
2.5 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
|
|
"""BAGEL processor for image and text inputs."""
|
|
|
|
from transformers import AutoProcessor
|
|
from transformers.image_utils import ImageInput
|
|
from transformers.processing_utils import ProcessorMixin
|
|
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
|
|
|
|
|
class BagelProcessor(ProcessorMixin):
    """
    Constructs a BAGEL processor which wraps a
    SigLIP image processor and a Qwen2 tokenizer.

    Calling the processor forwards ``images`` to ``self.image_processor``
    and ``text`` to ``self.tokenizer``, and merges the two results when
    both inputs are supplied.
    """

    # Names of the wrapped components and the classes declared for them.
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "SiglipImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __call__(
        self,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput]
        | None = None,
        images: ImageInput | None = None,
        **kwargs,
    ):
        """
        Main method to prepare for the model one or several sequence(s) and image(s).

        Args:
            text: Text to tokenize. Skipped when ``None``.
            images: Image(s) to preprocess. Skipped when ``None``.
            **kwargs: Forwarded unchanged to the tokenizer. A copy is
                forwarded to the image processor, with ``return_tensors``
                defaulting to ``"pt"`` when the caller did not set it.

        Returns:
            - both ``text`` and ``images`` given: the tokenizer output with
              a ``"pixel_values"`` entry copied from the image processor
              output;
            - only ``images`` given: the image processor output;
            - only ``text`` given: the tokenizer output;
            - neither given: ``None``.
        """
        pixel_values = None
        if images is not None:
            # Work on a copy so the "pt" default applies to the image
            # processor only; the tokenizer call below still receives the
            # caller's kwargs untouched.
            image_kwargs = {**kwargs}
            image_kwargs.setdefault("return_tensors", "pt")
            pixel_values = self.image_processor(images, **image_kwargs)

        text_inputs = self.tokenizer(text, **kwargs) if text is not None else None

        if pixel_values is None:
            # Text-only call, or nothing to process (text_inputs is None).
            return text_inputs
        if text_inputs is None:
            # Image-only call: hand back the image processor output as-is.
            return pixel_values

        # Both modalities present: attach the pixels to the tokenizer output.
        text_inputs["pixel_values"] = pixel_values["pixel_values"]
        return text_inputs

    def batch_decode(self, *args, **kwargs):
        """
        Forward all arguments to the wrapped tokenizer's ``batch_decode``
        and return its result.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Forward all arguments to the wrapped tokenizer's ``decode``
        and return its result.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """
        Input names of the tokenizer followed by those of the image
        processor, with duplicates removed and first-seen order preserved.
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # dict.fromkeys de-duplicates while keeping insertion order.
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
|
|
|
|
|
# Register BagelProcessor with transformers' AutoProcessor at import time,
# keyed by the string "BagelProcessor".
# NOTE(review): AutoProcessor.register is documented as taking a config class
# as its first argument — confirm that a string key is intended here.
AutoProcessor.register("BagelProcessor", BagelProcessor)
|