This commit is contained in:
2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions

View File

@@ -0,0 +1,383 @@
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.image_utils import PILImageResampling, load_image
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin
from ...test_processing_common import url_to_local_path
if is_vision_available():
from PIL import Image
from transformers import Idefics3ImageProcessor
if is_torchvision_available():
from transformers import Idefics3ImageProcessorFast
if is_torch_available():
import torch
class Idefics3ImageProcessingTester:
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
num_images=1,
image_size=18,
min_resolution=30,
max_resolution=40,
do_resize=True,
size=None,
max_image_size=None,
do_rescale=True,
rescale_factor=1 / 255,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
do_convert_rgb=True,
do_pad=True,
do_image_splitting=True,
resample=PILImageResampling.LANCZOS,
):
self.size = size if size is not None else {"longest_edge": max_resolution}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.num_images = num_images
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.resample = resample
self.do_image_splitting = do_image_splitting
self.max_image_size = max_image_size if max_image_size is not None else {"longest_edge": 20}
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
self.do_pad = do_pad
def prepare_image_processor_dict(self):
return {
"do_convert_rgb": self.do_convert_rgb,
"do_resize": self.do_resize,
"size": self.size,
"max_image_size": self.max_image_size,
"do_rescale": self.do_rescale,
"rescale_factor": self.rescale_factor,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_pad": self.do_pad,
"do_image_splitting": self.do_image_splitting,
}
def get_expected_values(self, image_inputs, batched=False):
"""
This function computes the expected height and width when providing images to Idefics3ImageProcessor,
assuming do_resize is set to True. The expected size in that case the max image size.
"""
return self.max_image_size["longest_edge"], self.max_image_size["longest_edge"]
def expected_output_image_shape(self, images):
height, width = self.get_expected_values(images, batched=True)
effective_nb_images = (
self.num_images * 5 if self.do_image_splitting else 1
) # 5 is a squared image divided into 4 + global image resized
return effective_nb_images, self.num_channels, height, width
def prepare_image_inputs(
self,
batch_size=None,
min_resolution=None,
max_resolution=None,
num_channels=None,
num_images=None,
size_divisor=None,
equal_resolution=False,
numpify=False,
torchify=False,
):
"""This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
or a list of PyTorch tensors if one specifies torchify=True.
One can specify whether the images are of the same resolution or not.
"""
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
batch_size = batch_size if batch_size is not None else self.batch_size
min_resolution = min_resolution if min_resolution is not None else self.min_resolution
max_resolution = max_resolution if max_resolution is not None else self.max_resolution
num_channels = num_channels if num_channels is not None else self.num_channels
num_images = num_images if num_images is not None else self.num_images
images_list = []
for i in range(batch_size):
images = []
for j in range(num_images):
if equal_resolution:
width = height = max_resolution
else:
# To avoid getting image width/height 0
if size_divisor is not None:
# If `size_divisor` is defined, the image needs to have width/size >= `size_divisor`
min_resolution = max(size_divisor, min_resolution)
width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8))
images_list.append(images)
if not numpify and not torchify:
# PIL expects the channel dimension as last dimension
images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list]
if torchify:
images_list = [[torch.from_numpy(image) for image in images] for images in images_list]
if numpify:
# Numpy images are typically in channels last format
images_list = [[image.transpose(1, 2, 0) for image in images] for images in images_list]
return images_list
@require_torch
@require_vision
class Idefics3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = Idefics3ImageProcessor if is_vision_available() else None
fast_image_processing_class = Idefics3ImageProcessorFast if is_torchvision_available() else None
def setUp(self):
super().setUp()
self.image_processor_tester = Idefics3ImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "resample"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
self.assertTrue(hasattr(image_processing, "max_image_size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "rescale_factor"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
def test_call_numpy(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for sample_images in image_inputs:
for image in sample_images:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
def test_call_numpy_4_channels(self):
# Idefics3 always processes images as RGB, so it always returns images with 3 channels
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processor_dict = self.image_processor_dict
image_processing = image_processing_class(**image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for sample_images in image_inputs:
for image in sample_images:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
def test_call_pil(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
for images in image_inputs:
for image in images:
self.assertIsInstance(image, Image.Image)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
def test_call_pytorch(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
for images in image_inputs:
for image in images:
self.assertIsInstance(image, torch.Tensor)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
self.assertEqual(
tuple(encoded_images.shape),
(self.image_processor_tester.batch_size, *expected_output_image_shape),
)
@require_vision
@require_torch
def test_slow_fast_equivalence(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
dummy_image = load_image(url_to_local_path("http://images.cocodataset.org/val2017/000000039769.jpg"))
dummy_image = dummy_image.resize((100, 150))
image_processor_slow = self.image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
image_processor_fast = self.fast_image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt", return_row_col_info=True)
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt", return_row_col_info=True)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self._assert_slow_fast_tensors_equivalence(
encoding_slow.pixel_attention_mask.float(), encoding_fast.pixel_attention_mask.float()
)
self.assertEqual(encoding_slow.rows, encoding_fast.rows)
self.assertEqual(encoding_slow.cols, encoding_fast.cols)
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
self.skipTest(
reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
)
dummy_images = self.image_processor_tester.prepare_image_inputs(
equal_resolution=False, num_images=5, torchify=True
)
# pop some images to have non homogenous batches:
indices_to_pop = [i if np.random.random() < 0.5 else None for i in range(len(dummy_images))]
for i in indices_to_pop:
if i is not None:
dummy_images[i].pop()
image_processor_slow = self.image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
image_processor_fast = self.fast_image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt", return_row_col_info=True)
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt", return_row_col_info=True)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=3e-1)
self._assert_slow_fast_tensors_equivalence(
encoding_slow.pixel_attention_mask.float(), encoding_fast.pixel_attention_mask.float()
)
self.assertEqual(encoding_slow.rows, encoding_fast.rows)
self.assertEqual(encoding_slow.cols, encoding_fast.cols)
def test_get_num_patches_without_images(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
height=100, width=100, images_kwargs={}
)
self.assertEqual(num_patches_and_row_cols, (5, 2, 2))
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
height=300, width=500, images_kwargs={"do_image_splitting": False}
)
self.assertEqual(num_patches_and_row_cols, (1, 1, 1))
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
height=300, width=500, images_kwargs={"do_image_splitting": True}
)
self.assertEqual(num_patches_and_row_cols, (5, 2, 2))
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
height=300,
width=600,
images_kwargs={"do_image_splitting": True, "max_image_size": {"longest_edge": 30}},
)
self.assertEqual(num_patches_and_row_cols, (3, 1, 2))

View File

@@ -0,0 +1,551 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Idefics3 model."""
import copy
import unittest
from io import BytesIO
import pytest
import requests
from transformers import (
AutoProcessor,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import (
cleanup,
require_bitsandbytes,
require_torch,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
if is_torch_available():
import torch
from transformers import (
Idefics3Config,
Idefics3ForConditionalGeneration,
Idefics3Model,
)
if is_vision_available():
from PIL import Image
class Idefics3VisionText2TextModelTester:
def __init__(
self,
parent,
is_training=True,
batch_size=2,
scale_factor=2,
num_images=2,
vision_config={
"image_size": 16,
"patch_size": 4,
"hidden_size": 32,
"num_hidden_layers": 2,
"num_attention_heads": 4,
"intermediate_size": 32,
"dropout": 0.1,
"attention_dropout": 0.1,
"initializer_range": 0.02,
},
text_config={
"vocab_size": 100,
"hidden_size": 64,
"intermediate_size": 56,
"num_hidden_layers": 2,
"num_attention_heads": 2,
"num_key_value_heads": 2,
"hidden_act": "silu",
"max_position_embeddings": 256,
"initializer_range": 0.02,
"rms_norm_eps": 1e-6,
"pad_token_id": 2,
"bos_token_id": 0,
"eos_token_id": 1,
"image_token_id": 57,
"tie_word_embeddings": False,
"rope_theta": 10000.0,
"sliding_window": 32,
"attention_dropout": 0.0,
},
use_cache=False,
tie_word_embeddings=False,
image_token_id=57,
):
self.parent = parent
self.pad_token_id = text_config["pad_token_id"]
self.is_training = is_training
self.batch_size = batch_size
self.num_images = num_images
self.scale_factor = scale_factor
self.seq_length = (
int(((vision_config["image_size"] // vision_config["patch_size"]) ** 2) / (self.scale_factor**2))
* self.num_images
)
self.use_cache = use_cache
self.image_token_id = image_token_id
self.tie_word_embeddings = tie_word_embeddings
# Hack - add properties here so use common tests
self.vocab_size = text_config["vocab_size"]
self.num_hidden_layers = text_config["num_hidden_layers"]
self.num_attention_heads = text_config["num_attention_heads"]
self.hidden_size = text_config["hidden_size"]
self.vision_config = vision_config
self.text_config = text_config
def get_config(self):
return Idefics3Config(
use_cache=self.use_cache,
image_token_id=self.image_token_id,
tie_word_embeddings=self.tie_word_embeddings,
vision_config=self.vision_config,
text_config=self.text_config,
vocab_size=self.vocab_size,
scale_factor=self.scale_factor,
)
def prepare_config_and_inputs(self):
pixel_values = floats_tensor(
[
self.batch_size,
self.num_images,
3, # Idefics3ImageProcessor always generates RGB pixel values
self.vision_config["image_size"],
self.vision_config["image_size"],
]
)
config = self.get_config()
return config, pixel_values
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1
# For simplicity just set the last n tokens to the image token
n_image_tokens_per_batch = self.seq_length
input_ids[input_ids == self.image_token_id] = self.pad_token_id
input_ids[:, -n_image_tokens_per_batch:] = self.image_token_id
attention_mask = input_ids.ne(1).to(torch_device)
inputs_dict = {
"pixel_values": pixel_values,
"input_ids": input_ids,
"attention_mask": attention_mask,
}
return config, inputs_dict
@require_torch
class Idefics3ModelTest(ModelTesterMixin, unittest.TestCase):
"""
Model tester for `Idefics3`.
"""
all_model_classes = (Idefics3Model,) if is_torch_available() else ()
fx_compatible = False
test_torchscript = False
test_pruning = False
test_resize_embeddings = True
test_head_masking = False
def setUp(self):
self.model_tester = Idefics3VisionText2TextModelTester(self)
self.config_tester = ConfigTester(
self, config_class=Idefics3Config, has_text_modality=False, common_properties=["image_token_id"]
)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="input_embeds cannot be passed in without input_ids")
def test_inputs_embeds():
pass
@unittest.skip(reason="input_embeds cannot be passed in without input_ids")
def test_inputs_embeds_matches_input_ids(self):
pass
@unittest.skip(reason="Model does not support padding right")
def test_flash_attn_2_inference_padding_right(self):
pass
@unittest.skip(reason="Compile not yet supported in idefics3 models")
@pytest.mark.torch_compile_test
def test_sdpa_can_compile_dynamic(self):
pass
# We need to override as we need to prepare such that the image token is the last token
def test_resize_tokens_embeddings(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
if self.model_tester.is_training is False:
model.eval()
model_vocab_size = config.text_config.vocab_size
# Retrieve the embeddings and clone theme
model_embed = model.resize_token_embeddings(model_vocab_size)
cloned_embeddings = model_embed.weight.clone()
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
# Ignore copy
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
n_images = self.model_tester.num_images * self.model_tester.seq_length
model.image_token_id = model_vocab_size - 15 - 1
inputs_dict["input_ids"][:, -n_images:] = model.image_token_id
# make sure that decoder_input_ids are resized as well
if "decoder_input_ids" in inputs_dict:
inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that adding and removing tokens has not modified the first part of the embedding matrix.
models_equal = True
for p1, p2 in zip(cloned_embeddings, model_embed.weight):
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
self.assertTrue(models_equal)
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1)
self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size)
self.assertTrue(model.config.text_config.vocab_size, model.vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
# Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size
target_dimension = 128
model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0], target_dimension)
with self.assertRaisesRegex(
ValueError,
"Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer",
):
model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3)
# We need to override as we need to prepare such that the image token is the last token
def test_resize_embeddings_untied(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
original_config.tie_word_embeddings = False
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config).to(torch_device)
model.eval()
# if no output embeddings -> leave test
if model.get_output_embeddings() is None:
continue
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
n_images = self.model_tester.num_images * self.model_tester.seq_length
model.image_token_id = model_vocab_size - 15 - 1
inputs_dict["input_ids"][:, -n_images:] = model.image_token_id
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
@require_torch
class Idefics3ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTesterMixin, unittest.TestCase):
"""
Model tester for `Idefics3ForConditionalGeneration`.
"""
all_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": Idefics3ForConditionalGeneration} if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = True
test_head_masking = False
test_torchscript = False
def setUp(self):
self.model_tester = Idefics3VisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=Idefics3Config, has_text_modality=False)
@unittest.skip(reason="input_embeds cannot be passed in without input_ids")
def test_inputs_embeds():
pass
@unittest.skip(reason="Model does not support padding right")
def test_flash_attn_2_inference_padding_right(self):
pass
@pytest.mark.generate
@slow
@unittest.skip(
reason="Idefics3 doesn't support SDPA for all backbones, vision backbones has only eager/FA2 attention"
)
def test_eager_matches_sdpa_generate(self):
pass
@unittest.skip(reason="Compile not yet supported in Idefics3 models end-to-end")
@pytest.mark.torch_compile_test
def test_sdpa_can_compile_dynamic(self):
pass
# We need to override as we need to prepare such that the image token is the last token
def test_resize_tokens_embeddings(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
model_vocab_size = config.text_config.vocab_size
# Retrieve the embeddings and clone theme
model_embed = model.resize_token_embeddings(model_vocab_size)
cloned_embeddings = model_embed.weight.clone()
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
n_images = self.model_tester.num_images * self.model_tester.seq_length
model.model.image_token_id = model_vocab_size - 15 - 1
inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that adding and removing tokens has not modified the first part of the embedding matrix.
models_equal = True
for p1, p2 in zip(cloned_embeddings, model_embed.weight):
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
self.assertTrue(models_equal)
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1)
self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size)
self.assertTrue(model.config.text_config.vocab_size, model.vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
# Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size
target_dimension = 128
model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0], target_dimension)
with self.assertRaisesRegex(
ValueError,
"Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer",
):
model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3)
# We need to override as we need to prepare such that the image token is the last token
def test_resize_embeddings_untied(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
original_config.tie_word_embeddings = False
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config).to(torch_device)
model.eval()
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
n_images = self.model_tester.num_images * self.model_tester.seq_length
model.model.image_token_id = model_vocab_size - 15 - 1
inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
@require_torch
class Idefics3ForConditionalGenerationIntegrationTest(unittest.TestCase):
def setUp(self):
self.processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
self.image1 = Image.open(
BytesIO(
requests.get(
"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
).content
)
)
self.image2 = Image.open(
BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
)
self.image3 = Image.open(
BytesIO(
requests.get(
"https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
).content
)
)
def tearDown(self):
cleanup(torch_device, gc_collect=True)
@slow
@unittest.skip("multi-gpu tests are disabled for now")
def test_integration_test(self):
model = Idefics3ForConditionalGeneration.from_pretrained(
"HuggingFaceM4/Idefics3-8B-Llama3",
dtype=torch.bfloat16,
device_map="auto",
)
# Create inputs
text = "<image>In this image, we see"
images = self.image1
inputs = self.processor(text=text, images=images, return_tensors="pt", padding=True)
inputs.to(torch_device)
generated_ids = model.generate(**inputs, max_new_tokens=10)
generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
expected_generated_text = "<image>In this image, we see the Statue of Liberty, which is located on Liberty"
self.assertEqual(generated_texts[0], expected_generated_text)
@slow
@require_bitsandbytes
@unittest.skip("multi-gpu tests are disabled for now")
def test_integration_test_4bit(self):
# Let' s make sure we test the preprocessing to replace what is used
model = Idefics3ForConditionalGeneration.from_pretrained(
"HuggingFaceM4/Idefics3-8B-Llama3",
load_in_4bit=True,
device_map="auto",
)
# Create pixel inputs
text = ["<image>In this image, we see", "bla, bla <image><image>"]
images = [[self.image1], [self.image2, self.image3]]
inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=10)
generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
expected_generated_text = "<image>In this image, we see the Statue of Liberty, trees, buildings, water"
self.assertEqual(generated_texts[0], expected_generated_text)

View File

@@ -0,0 +1,427 @@
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import tempfile
import unittest
import numpy as np
from transformers import Idefics3Processor
from transformers.image_utils import load_image
from transformers.models.auto.processing_auto import AutoProcessor
from transformers.testing_utils import require_torch, require_vision
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
@require_torch
@require_vision
class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = Idefics3Processor
@classmethod
def setUpClass(cls):
cls.tmpdirname = tempfile.mkdtemp()
processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=2)
processor.save_pretrained(cls.tmpdirname)
cls.image1 = load_image(
url_to_local_path(
"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
)
)
cls.image2 = load_image(
url_to_local_path("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
)
cls.image3 = load_image(
url_to_local_path(
"https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
)
)
cls.bos_token = processor.tokenizer.bos_token
cls.image_token = processor.image_token
cls.fake_image_token = processor.fake_image_token
cls.global_img_token = processor.global_image_tag
cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token)
cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token)
cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"]
cls.padding_token_id = processor.tokenizer.pad_token_id
cls.image_seq_len = processor.image_seq_len
def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def get_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
@staticmethod
def prepare_processor_dict():
return {"image_seq_len": 2}
# Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens
def test_get_num_vision_tokens(self):
"Tests general functionality of the helper used internally in vLLM"
processor = self.get_processor()
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
self.assertTrue("num_image_tokens" in output)
self.assertEqual(len(output["num_image_tokens"]), 3)
self.assertTrue("num_image_patches" in output)
self.assertEqual(len(output["num_image_patches"]), 3)
def get_split_image_expected_tokens(self, processor, image_rows, image_cols):
text_split_images = []
for n_h in range(image_rows):
for n_w in range(image_cols):
text_split_images += (
[self.fake_image_token_id]
+ processor.tokenizer(f"<row_{n_h + 1}_col_{n_w + 1}>", add_special_tokens=False)["input_ids"]
+ [self.image_token_id] * self.image_seq_len
)
text_split_images += processor.tokenizer("\n", add_special_tokens=False)["input_ids"]
text_split_images = text_split_images[:-1] # remove last newline
# add double newline, as it gets its own token
text_split_images += processor.tokenizer("\n\n", add_special_tokens=False)["input_ids"]
text_split_images += (
[self.fake_image_token_id]
+ self.global_img_tokens_id
+ [self.image_token_id] * self.image_seq_len
+ [self.fake_image_token_id]
)
return text_split_images
@classmethod
def tearDownClass(cls):
cls.image1.close()
cls.image2.close()
cls.image3.close()
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
def test_process_interleaved_images_prompts_no_image_splitting(self):
processor = self.get_processor()
processor.image_processor.do_image_splitting = False
# Test that a single image is processed correctly
inputs = processor(images=self.image1)
image1_expected_size = (364, 364)
self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 1, 3, *image1_expected_size))
self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 1, *image1_expected_size))
# fmt: on
# Test a single sample with image and text
image_str = "<image>"
text_str = "In this image, we see"
text = image_str + text_str
inputs = processor(text=text, images=self.image1)
# fmt: off
tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
self.assertEqual(inputs["input_ids"], expected_input_ids)
self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 1, 3, *image1_expected_size))
self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 1, *image1_expected_size))
# fmt: on
# Test that batch is correctly processed
image_str = "<image>"
text_str_1 = "In this image, we see"
text_str_2 = "In this image, we see"
text = [
image_str + text_str_1,
image_str + image_str + text_str_2,
]
images = [[self.image1], [self.image2, self.image3]]
inputs = processor(text=text, images=images, padding=True)
# fmt: off
tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False)
tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False)
image_tokens = [self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
expected_input_ids_1 = [self.bos_token_id] + image_tokens + tokenized_sentence_1["input_ids"]
expected_input_ids_2 = [self.bos_token_id] + 2 * image_tokens + tokenized_sentence_2["input_ids"]
# Pad the first input to match the second input
pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
padded_expected_input_ids_1 = [self.padding_token_id] * pad_len + expected_input_ids_1
self.assertEqual(
inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2]
)
self.assertEqual(
inputs["attention_mask"],
[[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
)
self.assertEqual(np.array(inputs['pixel_values']).shape, (2, 2, 3, 364, 364))
self.assertEqual(np.array(inputs['pixel_attention_mask']).shape, (2, 2, 364, 364))
# fmt: on
def test_process_interleaved_images_prompts_image_splitting(self):
processor = self.get_processor()
processor.image_processor.do_image_splitting = True
# Test that a single image is processed correctly
inputs = processor(images=self.image1)
self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 13, 3, 364, 364))
self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 13, 364, 364))
# fmt: on
self.maxDiff = None
# Test a single sample with image and text
image_str = "<image>"
text_str = "In this image, we see"
text = image_str + text_str
inputs = processor(text=text, images=self.image1)
# fmt: off
tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
split_image1_tokens = self.get_split_image_expected_tokens(processor, 3, 4)
expected_input_ids_1 = [[self.bos_token_id] + split_image1_tokens + tokenized_sentence["input_ids"]]
self.assertEqual(inputs["input_ids"], expected_input_ids_1)
self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids_1[0])])
self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 13, 3, 364, 364))
self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 13, 364, 364))
# fmt: on
# Test that batch is correctly processed
image_str = "<image>"
text_str_1 = "In this image, we see"
text_str_2 = "bla, bla"
text = [
image_str + text_str_1,
text_str_2 + image_str + image_str,
]
images = [[self.image1], [self.image2, self.image3]]
inputs = processor(text=text, images=images, padding=True)
# fmt: off
tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False)
tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False)
split_image1_tokens = self.get_split_image_expected_tokens(processor, 3, 4)
split_image2_tokens = self.get_split_image_expected_tokens(processor, 4, 4)
split_image3_tokens = self.get_split_image_expected_tokens(processor, 3, 4)
expected_input_ids_1 = [self.bos_token_id] + split_image1_tokens + tokenized_sentence_1["input_ids"]
expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + split_image2_tokens + split_image3_tokens
# Pad the first input to match the second input
pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
padded_expected_input_ids_1 = [self.padding_token_id] * pad_len + expected_input_ids_1
self.assertEqual(
inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2]
)
self.assertEqual(
inputs["attention_mask"],
[[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
)
self.assertEqual(np.array(inputs['pixel_values']).shape, (2, 30, 3, 364, 364))
self.assertEqual(np.array(inputs['pixel_attention_mask']).shape, (2, 30, 364, 364))
# fmt: on
def test_add_special_tokens_processor(self):
processor = self.get_processor()
image_str = "<image>"
text_str = "In this image, we see"
text = text_str + image_str
# fmt: off
inputs = processor(text=text, images=self.image1, add_special_tokens=False)
tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
split_image1_tokens = self.get_split_image_expected_tokens(processor, 3, 4)
expected_input_ids = [tokenized_sentence["input_ids"] + split_image1_tokens]
self.assertEqual(inputs["input_ids"], expected_input_ids)
inputs = processor(text=text, images=self.image1)
expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"] + split_image1_tokens]
self.assertEqual(inputs["input_ids"], expected_input_ids)
# fmt: on
def test_non_nested_images_with_batched_text(self):
processor = self.get_processor()
processor.image_processor.do_image_splitting = False
image_str = "<image>"
text_str_1 = "In this image, we see"
text_str_2 = "In this image, we see"
text = [
image_str + text_str_1,
image_str + image_str + text_str_2,
]
images = [self.image1, self.image2, self.image3]
inputs = processor(text=text, images=images, padding=True)
self.assertEqual(np.array(inputs["pixel_values"]).shape, (2, 2, 3, 364, 364))
self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (2, 2, 364, 364))
# Copied from tests.models.idefics2.test_processing_idefics2.Idefics2ProcessorTest.test_process_interleaved_images_prompts_image_error
def test_process_interleaved_images_prompts_image_error(self):
processor = self.get_processor()
text = [
"This is a test sentence.",
"In this other sentence we try some good things",
]
images = [[self.image1], [self.image2]]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
images = [[self.image1], []]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
text = [
"This is a test sentence.<image>",
"In this other sentence we try some good things<image>",
]
images = [[self.image1], [self.image2, self.image3]]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
images = [[], [self.image2]]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
images = [self.image1, self.image2, self.image3]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
images = [self.image1]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
text = [
"This is a test sentence.",
"In this other sentence we try some good things<image>",
]
images = [[self.image1], []]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
images = [self.image1, self.image2]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
def test_apply_chat_template(self):
# Message contains content which a mix of lists with images and image urls and string
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "What do these images show?"},
{"type": "image"},
{"type": "image"},
"What do these images show?",
],
},
{
"role": "assistant",
"content": [
{
"type": "text",
"text": "The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.",
}
],
},
{"role": "user", "content": [{"type": "text", "text": "And who is that?"}]},
]
processor = self.get_processor()
# Make short sequence length to test that the fake tokens are added correctly
rendered = processor.apply_chat_template(messages, add_generation_prompt=True)
expected_rendered = (
"<|begin_of_text|>User: What do these images show?<image><image><end_of_utterance>\n"
"Assistant: The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.<end_of_utterance>\n"
"User: And who is that?<end_of_utterance>\n"
"Assistant:"
)
self.assertEqual(rendered, expected_rendered)
@require_torch
@require_vision
def test_text_only_inference(self):
"""Test that the processor works correctly with text-only input."""
processor = self.get_processor()
text = "This is a simple text without images."
inputs = processor(text=text)
tokenized_sentence = processor.tokenizer(text, add_special_tokens=False)
expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"]]
self.assertEqual(inputs["input_ids"], expected_input_ids)
self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
self.assertTrue("pixel_values" not in inputs)
self.assertTrue("pixel_attention_mask" not in inputs)
# Test batch of texts without image tokens
texts = ["First text.", "Second piece of text."]
batch_inputs = processor(text=texts, padding=True)
tokenized_1 = processor.tokenizer(texts[0], add_special_tokens=False)
tokenized_2 = processor.tokenizer(texts[1], add_special_tokens=False)
expected_1 = [self.bos_token_id] + tokenized_1["input_ids"]
expected_2 = [self.bos_token_id] + tokenized_2["input_ids"]
# Pad the shorter sequence
pad_len = len(expected_2) - len(expected_1)
if pad_len > 0:
padded_expected_1 = [self.padding_token_id] * pad_len + expected_1
expected_attention_1 = [0] * pad_len + [1] * len(expected_1)
self.assertEqual(batch_inputs["input_ids"], [padded_expected_1, expected_2])
self.assertEqual(batch_inputs["attention_mask"], [expected_attention_1, [1] * len(expected_2)])
else:
pad_len = -pad_len
padded_expected_2 = [self.padding_token_id] * pad_len + expected_2
expected_attention_2 = [0] * pad_len + [1] * len(expected_2)
self.assertEqual(batch_inputs["input_ids"], [expected_1, padded_expected_2])
self.assertEqual(batch_inputs["attention_mask"], [[1] * len(expected_1), expected_attention_2])
@require_torch
@require_vision
def test_missing_images_error(self):
"""Test that appropriate error is raised when images are referenced but not provided."""
processor = self.get_processor()
# Test single text with image token but no image
text = "Let me show you this image: <image> What do you think?"
with self.assertRaises(ValueError) as context:
processor(text=text)
self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
# Test batch with image tokens but no images
texts = [
"First text with <image> token.",
"Second text <image> with token.",
]
with self.assertRaises(ValueError) as context:
processor(text=texts)
self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
# Test with None as Images
with self.assertRaises(ValueError) as context:
processor(text=text, images=None)
self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
with self.assertRaises(ValueError) as context:
processor(text=texts, images=None)
self.assertTrue("tokens in the text but no images were passed" in str(context.exception))