85 lines
2.7 KiB
Python
85 lines
2.7 KiB
Python
|
|
# SPDX-License-Identifier: Apache-2.0
|
||
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||
|
|
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
|
||
|
|
"""BAGEL processor for image and text inputs."""
|
||
|
|
|
||
|
|
from transformers import AutoProcessor
|
||
|
|
from transformers.feature_extraction_utils import BatchFeature
|
||
|
|
from transformers.image_utils import ImageInput
|
||
|
|
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
||
|
|
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||
|
|
|
||
|
|
|
||
|
|
class BagelProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
|
||
|
|
_defaults = {
|
||
|
|
"images_kwargs": {
|
||
|
|
"return_tensors": "pt",
|
||
|
|
},
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
class BagelProcessor(ProcessorMixin):
|
||
|
|
"""
|
||
|
|
Constructs a BAGEL processor which wraps a
|
||
|
|
SigLIP image processor and a Qwen2 tokenizer.
|
||
|
|
"""
|
||
|
|
|
||
|
|
attributes = ["image_processor", "tokenizer"]
|
||
|
|
image_processor_class = "SiglipImageProcessor"
|
||
|
|
tokenizer_class = "AutoTokenizer"
|
||
|
|
|
||
|
|
def __call__(
|
||
|
|
self,
|
||
|
|
text: TextInput
|
||
|
|
| PreTokenizedInput
|
||
|
|
| list[TextInput]
|
||
|
|
| list[PreTokenizedInput] = None,
|
||
|
|
images: ImageInput = None,
|
||
|
|
**kwargs: Unpack[BagelProcessorKwargs],
|
||
|
|
):
|
||
|
|
"""
|
||
|
|
Main method to prepare for the model one or several sequences(s) and image(s).
|
||
|
|
"""
|
||
|
|
output_kwargs = self._merge_kwargs(
|
||
|
|
BagelProcessorKwargs,
|
||
|
|
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||
|
|
**kwargs,
|
||
|
|
)
|
||
|
|
|
||
|
|
if images is not None:
|
||
|
|
# Process images with the image processor
|
||
|
|
pixel_values = self.image_processor(
|
||
|
|
images, **output_kwargs["images_kwargs"]
|
||
|
|
)
|
||
|
|
else:
|
||
|
|
pixel_values = {}
|
||
|
|
|
||
|
|
text_inputs = (
|
||
|
|
self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||
|
|
if text is not None
|
||
|
|
else {}
|
||
|
|
)
|
||
|
|
|
||
|
|
return BatchFeature(data={**pixel_values, **text_inputs})
|
||
|
|
|
||
|
|
def batch_decode(self, *args, **kwargs):
|
||
|
|
"""
|
||
|
|
This method forwards all its arguments to Qwen2TokenizerFast's batch_decode.
|
||
|
|
"""
|
||
|
|
return self.tokenizer.batch_decode(*args, **kwargs)
|
||
|
|
|
||
|
|
def decode(self, *args, **kwargs):
|
||
|
|
"""
|
||
|
|
This method forwards all its arguments to Qwen2TokenizerFast's decode.
|
||
|
|
"""
|
||
|
|
return self.tokenizer.decode(*args, **kwargs)
|
||
|
|
|
||
|
|
@property
|
||
|
|
def model_input_names(self):
|
||
|
|
tokenizer_input_names = self.tokenizer.model_input_names
|
||
|
|
image_processor_input_names = self.image_processor.model_input_names
|
||
|
|
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||
|
|
|
||
|
|
|
||
|
|
AutoProcessor.register("BagelProcessor", BagelProcessor)
|