2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions


@@ -0,0 +1,383 @@
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.image_utils import PILImageResampling, load_image
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin
from ...test_processing_common import url_to_local_path
if is_vision_available():
from PIL import Image
from transformers import SmolVLMImageProcessor
if is_torchvision_available():
from transformers import SmolVLMImageProcessorFast
if is_torch_available():
import torch
class SmolVLMImageProcessingTester:
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
num_images=1,
image_size=18,
min_resolution=30,
max_resolution=40,
do_resize=True,
size=None,
max_image_size=None,
do_rescale=True,
rescale_factor=1 / 255,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
do_convert_rgb=True,
do_pad=True,
do_image_splitting=True,
resample=PILImageResampling.LANCZOS,
):
self.size = size if size is not None else {"longest_edge": max_resolution}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.num_images = num_images
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.resample = resample
self.do_image_splitting = do_image_splitting
self.max_image_size = max_image_size if max_image_size is not None else {"longest_edge": 20}
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
self.do_pad = do_pad
def prepare_image_processor_dict(self):
return {
"do_convert_rgb": self.do_convert_rgb,
"do_resize": self.do_resize,
"size": self.size,
"max_image_size": self.max_image_size,
"do_rescale": self.do_rescale,
"rescale_factor": self.rescale_factor,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_pad": self.do_pad,
"do_image_splitting": self.do_image_splitting,
}
def get_expected_values(self, image_inputs, batched=False):
"""
This function computes the expected height and width when providing images to SmolVLMImageProcessor,
assuming do_resize is set to True. The expected size in that case is the max image size.
"""
return self.max_image_size["longest_edge"], self.max_image_size["longest_edge"]
def expected_output_image_shape(self, images):
height, width = self.get_expected_values(images, batched=True)
effective_nb_images = (
self.num_images * 5 if self.do_image_splitting else 1
) # 5 = a square image split into 4 sub-images + the resized global image
return effective_nb_images, self.num_channels, height, width
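# A sketch of the resulting shapes under the tester defaults above (num_images=1,
# do_image_splitting=True, num_channels=3, max_image_size={"longest_edge": 20}):
#     effective_nb_images = 1 * 5 = 5            # 4 sub-images + 1 resized global image
#     expected_output_image_shape -> (5, 3, 20, 20)
# so the batched pixel_values checked in the tests below are expected to have
# shape (batch_size, 5, 3, 20, 20).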
def prepare_image_inputs(
self,
batch_size=None,
min_resolution=None,
max_resolution=None,
num_channels=None,
num_images=None,
size_divisor=None,
equal_resolution=False,
numpify=False,
torchify=False,
):
"""This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
or a list of PyTorch tensors if one specifies torchify=True.
One can specify whether the images are of the same resolution or not.
"""
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
batch_size = batch_size if batch_size is not None else self.batch_size
min_resolution = min_resolution if min_resolution is not None else self.min_resolution
max_resolution = max_resolution if max_resolution is not None else self.max_resolution
num_channels = num_channels if num_channels is not None else self.num_channels
num_images = num_images if num_images is not None else self.num_images
images_list = []
for i in range(batch_size):
images = []
for j in range(num_images):
if equal_resolution:
width = height = max_resolution
else:
# To avoid getting image width/height 0
if size_divisor is not None:
# If `size_divisor` is defined, the image needs to have width/height >= `size_divisor`
min_resolution = max(size_divisor, min_resolution)
width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8))
images_list.append(images)
if not numpify and not torchify:
# PIL expects the channel dimension as last dimension
images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list]
if torchify:
images_list = [[torch.from_numpy(image) for image in images] for images in images_list]
if numpify:
# Numpy images are typically in channels last format
images_list = [[image.transpose(1, 2, 0) for image in images] for images in images_list]
return images_list
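# Typical usage in the tests below (the nested structure is one sub-list of images
# per batch sample):
#     image_inputs = tester.prepare_image_inputs(equal_resolution=False, numpify=True)
#     # -> [[channels-last uint8 np.ndarray] * num_images] * batch_size
# With torchify=True the inner items are channels-first torch tensors instead, and
# with neither flag they are PIL images.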
@require_torch
@require_vision
class SmolVLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = SmolVLMImageProcessor if is_vision_available() else None
fast_image_processing_class = SmolVLMImageProcessorFast if is_torchvision_available() else None
def setUp(self):
super().setUp()
self.image_processor_tester = SmolVLMImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "resample"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
self.assertTrue(hasattr(image_processing, "max_image_size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "rescale_factor"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "do_image_splitting"))
def test_call_numpy(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for sample_images in image_inputs:
for image in sample_images:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
def test_call_numpy_4_channels(self):
# SmolVLM always processes images as RGB, so it always returns images with 3 channels
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processor_dict = self.image_processor_dict
image_processing = image_processing_class(**image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for sample_images in image_inputs:
for image in sample_images:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
def test_call_pil(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
for images in image_inputs:
for image in images:
self.assertIsInstance(image, Image.Image)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
def test_call_pytorch(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
for images in image_inputs:
for image in images:
self.assertIsInstance(image, torch.Tensor)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
self.assertEqual(
tuple(encoded_images.shape),
(self.image_processor_tester.batch_size, *expected_output_image_shape),
)
@require_vision
@require_torch
def test_slow_fast_equivalence(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
dummy_image = load_image(url_to_local_path("http://images.cocodataset.org/val2017/000000039769.jpg"))
dummy_image = dummy_image.resize((100, 150))
image_processor_slow = self.image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
image_processor_fast = self.fast_image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt", return_row_col_info=True)
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt", return_row_col_info=True)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self._assert_slow_fast_tensors_equivalence(
encoding_slow.pixel_attention_mask.float(), encoding_fast.pixel_attention_mask.float()
)
self.assertEqual(encoding_slow.rows, encoding_fast.rows)
self.assertEqual(encoding_slow.cols, encoding_fast.cols)
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
self.skipTest(
reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
)
dummy_images = self.image_processor_tester.prepare_image_inputs(
equal_resolution=False, num_images=5, torchify=True
)
# pop some images to get non-homogeneous batches:
indices_to_pop = [i if np.random.random() < 0.5 else None for i in range(len(dummy_images))]
for i in indices_to_pop:
if i is not None:
dummy_images[i].pop()
image_processor_slow = self.image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
image_processor_fast = self.fast_image_processing_class(
**self.image_processor_dict, resample=PILImageResampling.BICUBIC
)
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt", return_row_col_info=True)
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt", return_row_col_info=True)
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=3e-1)
self._assert_slow_fast_tensors_equivalence(
encoding_slow.pixel_attention_mask.float(), encoding_fast.pixel_attention_mask.float()
)
self.assertEqual(encoding_slow.rows, encoding_fast.rows)
self.assertEqual(encoding_slow.cols, encoding_fast.cols)
def test_get_num_patches_without_images(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
height=100, width=100, images_kwargs={}
)
self.assertEqual(num_patches_and_row_cols, (5, 2, 2))
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
height=300, width=500, images_kwargs={"do_image_splitting": False}
)
self.assertEqual(num_patches_and_row_cols, (1, 1, 1))
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
height=300, width=500, images_kwargs={"do_image_splitting": True}
)
self.assertEqual(num_patches_and_row_cols, (5, 2, 2))
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
height=300,
width=600,
images_kwargs={"do_image_splitting": True, "max_image_size": {"longest_edge": 30}},
)
self.assertEqual(num_patches_and_row_cols, (3, 1, 2))
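# The expected (num_patches, rows, cols) values above are consistent with the
# following sketch of the splitting arithmetic (an illustration, not the actual
# implementation): the image is first resized so its longest edge is at most
# size["longest_edge"] (40 here), then tiled into ceil(h / max_edge) x ceil(w / max_edge)
# sub-images with max_edge = max_image_size["longest_edge"], plus one global image:
#     100x100 -> resized 40x40          -> ceil(40/20) x ceil(40/20) = 2x2 tiles + 1 = (5, 2, 2)
#     300x500 -> resized 24x40          -> ceil(24/20) x ceil(40/20) = 2x2 tiles + 1 = (5, 2, 2)
#     300x600, max_edge=30 -> resized 20x40 -> 1x2 tiles + 1 = (3, 1, 2)
# and with do_image_splitting=False only the single global image remains: (1, 1, 1).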


@@ -0,0 +1,680 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch SmolVLM model."""
import copy
import unittest
from io import BytesIO
import pytest
import requests
from parameterized import parameterized
from transformers import (
AutoProcessor,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import (
Expectations,
cleanup,
is_flaky,
require_torch,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
if is_torch_available():
import torch
from transformers import (
GenerationConfig,
SmolVLMConfig,
SmolVLMForConditionalGeneration,
SmolVLMModel,
)
if is_vision_available():
from PIL import Image
class SmolVLMVisionText2TextModelTester:
def __init__(
self,
parent,
is_training=True,
batch_size=2,
scale_factor=2,
num_images=2,
vision_config={
"image_size": 16,
"patch_size": 4,
"hidden_size": 32,
"num_hidden_layers": 2,
"num_attention_heads": 4,
"intermediate_size": 32,
"dropout": 0.1,
"attention_dropout": 0.1,
"initializer_range": 0.02,
},
text_config={
"vocab_size": 100,
"hidden_size": 64,
"intermediate_size": 56,
"num_hidden_layers": 2,
"num_attention_heads": 2,
"num_key_value_heads": 2,
"hidden_act": "silu",
"max_position_embeddings": 256,
"initializer_range": 0.02,
"rms_norm_eps": 1e-6,
"pad_token_id": 2,
"bos_token_id": 0,
"eos_token_id": 1,
"image_token_id": 57,
"tie_word_embeddings": False,
"rope_theta": 10000.0,
"sliding_window": 32,
"attention_dropout": 0.0,
},
use_cache=False,
tie_word_embeddings=False,
image_token_id=57,
):
self.parent = parent
self.is_training = is_training
self.batch_size = batch_size
self.num_images = num_images
self.scale_factor = scale_factor
self.seq_length = (
int(((vision_config["image_size"] // vision_config["patch_size"]) ** 2) / (self.scale_factor**2))
* self.num_images
)
self.use_cache = use_cache
self.image_token_id = image_token_id
self.tie_word_embeddings = tie_word_embeddings
# Hack: add properties here so the common tests can be reused
self.vocab_size = text_config["vocab_size"]
self.num_hidden_layers = text_config["num_hidden_layers"]
self.num_attention_heads = text_config["num_attention_heads"]
self.hidden_size = text_config["hidden_size"]
self.vision_config = vision_config
self.text_config = text_config
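# With the default configs above the sequence length works out to:
#     seq_length = int(((16 // 4) ** 2) / (2 ** 2)) * 2 = int(16 / 4) * 2 = 8
# i.e. 4 placeholder image tokens per image times num_images=2.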
def get_config(self):
return SmolVLMConfig(
use_cache=self.use_cache,
image_token_id=self.image_token_id,
tie_word_embeddings=self.tie_word_embeddings,
vision_config=self.vision_config,
text_config=self.text_config,
vocab_size=self.vocab_size,
scale_factor=self.scale_factor,
)
def prepare_config_and_inputs(self):
pixel_values = floats_tensor(
[
self.batch_size,
self.num_images,
3, # SmolVLMImageProcessor always generates RGB pixel values
self.vision_config["image_size"],
self.vision_config["image_size"],
]
)
config = self.get_config()
return config, pixel_values
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1
# For simplicity just set the last n tokens to the image token
n_image_tokens_per_batch = self.seq_length
input_ids[:, -n_image_tokens_per_batch:] = self.image_token_id
attention_mask = input_ids.ne(1).to(torch_device)
inputs_dict = {
"pixel_values": pixel_values,
"input_ids": input_ids,
"attention_mask": attention_mask,
}
return config, inputs_dict
@require_torch
class SmolVLMModelTest(ModelTesterMixin, unittest.TestCase):
"""
Model tester for `SmolVLM`.
"""
all_model_classes = (SmolVLMModel,) if is_torch_available() else ()
fx_compatible = False
test_torchscript = False
test_pruning = False
test_resize_embeddings = True
test_head_masking = False
def setUp(self):
self.model_tester = SmolVLMVisionText2TextModelTester(self)
self.config_tester = ConfigTester(
self, config_class=SmolVLMConfig, has_text_modality=False, common_properties=["image_token_id"]
)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="Model does not support padding right")
def test_flash_attn_2_inference_padding_right(self):
pass
@unittest.skip(reason="Compile not yet supported in SmolVLM models")
@pytest.mark.torch_compile_test
def test_sdpa_can_compile_dynamic(self):
pass
@unittest.skip(reason="Compile not yet supported in SmolVLM models")
def test_sdpa_can_dispatch_on_flash(self):
pass
# We need to override this test to prepare the inputs so that the image tokens are the last tokens
def test_resize_tokens_embeddings(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
if self.model_tester.is_training is False:
model.eval()
model_vocab_size = config.text_config.vocab_size
# Retrieve the embeddings and clone them
model_embed = model.resize_token_embeddings(model_vocab_size)
cloned_embeddings = model_embed.weight.clone()
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
# Ignore copy
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
n_images = self.model_tester.num_images * self.model_tester.seq_length
model.image_token_id = model_vocab_size - 15 - 1
inputs_dict["input_ids"][:, -n_images:] = model.image_token_id
# make sure that decoder_input_ids are resized as well
if "decoder_input_ids" in inputs_dict:
inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that adding and removing tokens has not modified the first part of the embedding matrix.
models_equal = True
for p1, p2 in zip(cloned_embeddings, model_embed.weight):
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
self.assertTrue(models_equal)
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1)
self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size)
self.assertTrue(model.config.text_config.vocab_size, model.vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
# Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size
target_dimension = 128
model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0], target_dimension)
with self.assertRaisesRegex(
ValueError,
"Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer",
):
model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3)
# We need to override this test to prepare the inputs so that the image tokens are the last tokens
def test_resize_embeddings_untied(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
original_config.tie_word_embeddings = False
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config).to(torch_device)
model.eval()
# if no output embeddings -> leave test
if model.get_output_embeddings() is None:
continue
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
n_images = self.model_tester.num_images * self.model_tester.seq_length
model.image_token_id = model_vocab_size - 15 - 1
inputs_dict["input_ids"][:, -n_images:] = model.image_token_id
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
@require_torch
class SmolVLMForConditionalGenerationModelTest(GenerationTesterMixin, ModelTesterMixin, unittest.TestCase):
"""
Model tester for `SmolVLMForConditionalGeneration`.
"""
all_model_classes = (SmolVLMForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (SmolVLMForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": SmolVLMForConditionalGeneration} if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = True
test_head_masking = False
test_torchscript = False
def setUp(self):
self.model_tester = SmolVLMVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=SmolVLMConfig, has_text_modality=False)
@unittest.skip(reason="Model does not support padding right")
def test_flash_attn_2_inference_padding_right(self):
pass
@pytest.mark.generate
@is_flaky(description="TODO: check why flaky")
def test_generate_methods_with_logits_to_keep(self):
super().test_generate_methods_with_logits_to_keep()
@unittest.skip
def test_training_gradient_checkpointing(self):
pass
@unittest.skip(
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant(self):
pass
@unittest.skip(
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="Unsupported")
def test_generate_with_static_cache(self):
pass
@unittest.skip(reason="Compile not yet supported in SmolVLM models")
@pytest.mark.torch_compile_test
def test_sdpa_can_compile_dynamic(self):
pass
@unittest.skip(reason="Compile not yet supported in SmolVLM models")
def test_sdpa_can_dispatch_on_flash(self):
pass
@pytest.mark.generate
@slow
@unittest.skip(
reason="SmolVLM doesn't support SDPA for all backbones, vision backbones has only eager/FA2 attention"
)
def test_eager_matches_sdpa_generate(self):
pass
@parameterized.expand([("random",), ("same",)])
@pytest.mark.generate
@unittest.skip(reason="Cache position is off by one leaving out image tokens, FIXME raushan")
def test_assisted_decoding_matches_greedy_search(self, assistant_type):
pass
# We need to override this test to prepare the inputs so that the image tokens are the last tokens
def test_resize_tokens_embeddings(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
model_vocab_size = config.text_config.vocab_size
# Retrieve the embeddings and clone them
model_embed = model.resize_token_embeddings(model_vocab_size)
cloned_embeddings = model_embed.weight.clone()
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
n_images = self.model_tester.num_images * self.model_tester.seq_length
model.model.image_token_id = model_vocab_size - 15 - 1
inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that adding and removing tokens has not modified the first part of the embedding matrix.
models_equal = True
for p1, p2 in zip(cloned_embeddings, model_embed.weight):
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
self.assertTrue(models_equal)
config = copy.deepcopy(original_config)
model = model_class(config)
model.to(torch_device)
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1)
self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size)
self.assertTrue(model.config.text_config.vocab_size, model.vocab_size)
model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0] // 64, 0)
# Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size
target_dimension = 128
model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64)
self.assertTrue(model_embed.weight.shape[0], target_dimension)
with self.assertRaisesRegex(
ValueError,
"Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer",
):
model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3)
# We need to override this test to prepare the inputs so that the image tokens are the last tokens
def test_resize_embeddings_untied(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
original_config.tie_word_embeddings = False
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
model = model_class(config).to(torch_device)
model.eval()
# Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
model_vocab_size = config.text_config.vocab_size
model.resize_token_embeddings(model_vocab_size + 10)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model.resize_token_embeddings(model_vocab_size - 15)
self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
# Check that it actually resizes the embeddings matrix
output_embeds = model.get_output_embeddings()
self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
# Check bias if present
if output_embeds.bias is not None:
self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
# Check that the model can still do a forward pass successfully (every parameter should be resized)
# Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
n_images = self.model_tester.num_images * self.model_tester.seq_length
model.model.image_token_id = model_vocab_size - 15 - 1
inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
@require_torch
class SmolVLMForConditionalGenerationIntegrationTest(unittest.TestCase):
def setUp(self):
self.processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
self.image1 = Image.open(
BytesIO(
requests.get(
"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
).content
)
)
self.video_messages = [
{
"role": "user",
"content": [
{
"type": "video",
"path": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assisted-generation/gif_1_1080p.mov",
},
{"type": "text", "text": "Describe this video in detail"},
],
},
]
def tearDown(self):
cleanup(torch_device, gc_collect=True)
@slow
def test_integration_test(self):
model = SmolVLMForConditionalGeneration.from_pretrained(
"HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
dtype=torch.bfloat16,
device_map="auto",
)
# Create inputs
text = "<image>In this image, we see"
images = self.image1
inputs = self.processor(text=text, images=images, return_tensors="pt", padding=True)
inputs.to(device=torch_device, dtype=torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=9)
generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
expected_generated_text = "\n\n\n\nIn this image, we see a view of the Statue of Liberty and the"
self.assertEqual(generated_texts[0], expected_generated_text)
@slow
def test_integration_test_video(self):
model = SmolVLMForConditionalGeneration.from_pretrained(
"HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
dtype=torch.bfloat16,
device_map="auto",
)
# Create inputs
inputs = self.processor.apply_chat_template(
self.video_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
).to(device=torch_device, dtype=torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=20)
generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
expected_generated_text = Expectations(
{
(None, None): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature',
("cuda", (8, 0)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video showcases a large language model architecture, specifically a "Quick Brown" model, which is designed',
("cuda", (8, 6)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video showcases a large language model, specifically a neural network model, which is designed to learn and',
("rocm", (9, 4)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature',
("rocm", None): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video showcases a large language model architecture, specifically a "Quick Brown" model, which is designed',
}
).get_expectation() # fmt: skip
self.assertEqual(generated_texts[0], expected_generated_text)
@slow
def test_export_smolvlm_vision_encoder(self):
from transformers import AutoConfig
from transformers.integrations.executorch import TorchExportableModuleForVLM
model_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
# NOTE: The attention_mask is prepared internally in the vision encoder, depending on whether flash attention is used or not
# For ExecuTorch, flash attention is not supported, so the vision encoder must be exported in a way that is compatible with the text decoder
config = AutoConfig.from_pretrained(model_id)
config.text_config._flash_attn_2_enabled = False
# Load model and extract vision encoder
model = SmolVLMForConditionalGeneration.from_pretrained(
model_id,
dtype=torch.float32,
config=config,
)
exportable_module = TorchExportableModuleForVLM(model)
exported_program = exportable_module.export_vision_encoder()
self.assertIsInstance(exported_program, torch.export.ExportedProgram)
@slow
def test_export_smolvlm_connector(self):
from transformers import AutoConfig
from transformers.integrations.executorch import TorchExportableModuleForVLM
model_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
# NOTE: The attention_mask is prepared internally in the vision encoder, depending on whether flash attention is used or not
# For ExecuTorch, flash attention is not supported, so the vision encoder must be exported in a way that is compatible with the text decoder
config = AutoConfig.from_pretrained(model_id)
config.text_config._flash_attn_2_enabled = False
# Load the model and extract the connector (multi-modal projector)
model = SmolVLMForConditionalGeneration.from_pretrained(
model_id,
dtype=torch.float32,
config=config,
)
connector = model.model.connector
connector.eval()
exportable_module = TorchExportableModuleForVLM(model)
exported_program = exportable_module.export_connector()
self.assertIsInstance(exported_program, torch.export.ExportedProgram)
@slow
def test_export_smolvlm_text_decoder(self):
from transformers import AutoConfig
from transformers.integrations.executorch import TorchExportableModuleForVLM
model_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
# NOTE: The attention_mask is prepared internally in the vision encoder, depending on whether flash attention is used or not
# For ExecuTorch, flash attention is not supported, so the vision encoder must be exported in a way that is compatible with the text decoder
config = AutoConfig.from_pretrained(model_id)
config.text_config._flash_attn_2_enabled = False
config.text_config.use_cache = True
config.text_config.attn_implementation = "sdpa"
generation_config = GenerationConfig(
use_cache=True,
cache_implementation="static",
max_length=1234,
cache_config={
"batch_size": 1,
"max_cache_len": 1234,
},
)
# Load the model and extract the text decoder
model = SmolVLMForConditionalGeneration.from_pretrained(
model_id,
dtype=torch.float32,
config=config,
)
model.model.text_model.generation_config = generation_config
text_decoder = model.model.text_model
text_decoder.eval()
exportable_module = TorchExportableModuleForVLM(model)
exported_program = exportable_module.export_text_decoder()
self.assertIsInstance(exported_program, torch.export.ExportedProgram)


@@ -0,0 +1,597 @@
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import tempfile
import unittest
from typing import Optional
import numpy as np
from transformers import SmolVLMProcessor
from transformers.image_utils import load_image
from transformers.models.auto.processing_auto import AutoProcessor
from transformers.testing_utils import require_av, require_torch, require_vision
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
@require_torch
@require_vision
class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = SmolVLMProcessor
videos_input_name = "pixel_values"
@classmethod
def setUpClass(cls):
cls.tmpdirname = tempfile.mkdtemp()
processor_kwargs = cls.prepare_processor_dict()
processor = SmolVLMProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct", **processor_kwargs)
processor.save_pretrained(cls.tmpdirname)
cls.image1 = load_image(
url_to_local_path(
"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
)
)
cls.image2 = load_image(
url_to_local_path("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
)
cls.image3 = load_image(
url_to_local_path(
"https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
)
)
cls.bos_token = processor.tokenizer.bos_token
cls.image_token = processor.image_token
cls.video_token = processor.video_token
cls.fake_image_token = processor.fake_image_token
cls.global_img_token = processor.global_image_token
cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token)
cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token)
cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"]
cls.padding_token_id = processor.tokenizer.pad_token_id
cls.image_seq_len = processor.image_seq_len
@classmethod
def tearDownClass(cls):
cls.image1.close()
cls.image2.close()
cls.image3.close()
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def get_video_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
def get_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
@staticmethod
def prepare_processor_dict():
return {
"image_seq_len": 2,
"chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
}
# Override as SmolVLM needs images/video to be an explicitly nested batch
def prepare_image_inputs(self, batch_size: Optional[int] = None):
"""This function prepares a list of PIL images for testing"""
images = super().prepare_image_inputs(batch_size)
if isinstance(images, (list, tuple)):
images = [[image] for image in images]
return images
def prepare_video_inputs(self, batch_size: Optional[int] = None):
"""This function prepares a list of numpy videos."""
video_input = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] * 8
if batch_size is None:
return [[video_input]]
return [[video_input]] * batch_size
def get_split_image_expected_tokens(self, processor, image_rows, image_cols):
text_split_images = []
for n_h in range(image_rows):
for n_w in range(image_cols):
text_split_images += (
[self.fake_image_token_id]
+ processor.tokenizer(f"<row_{n_h + 1}_col_{n_w + 1}>", add_special_tokens=False)["input_ids"]
+ [self.image_token_id] * self.image_seq_len
)
text_split_images += processor.tokenizer("\n", add_special_tokens=False)["input_ids"]
text_split_images = text_split_images[:-1] # remove last newline
# add double newline, as it gets its own token
text_split_images += processor.tokenizer("\n\n", add_special_tokens=False)["input_ids"]
text_split_images += (
[self.fake_image_token_id]
+ self.global_img_tokens_id
+ [self.image_token_id] * self.image_seq_len
+ [self.fake_image_token_id]
)
return text_split_images
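# For reference, the token layout built above looks roughly like this for a 2x2 split
# (<fake> = fake image token, <img>*N = image_seq_len image tokens, <global> = the
# global image token ids):
#     <fake><row_1_col_1><img>*N <fake><row_1_col_2><img>*N \n
#     <fake><row_2_col_1><img>*N <fake><row_2_col_2><img>*N \n\n
#     <fake><global><img>*N <fake>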
def test_process_interleaved_images_prompts_no_image_splitting(self):
processor_components = self.prepare_components()
processor_components["tokenizer"] = self.get_component("tokenizer", padding_side="left")
processor_components["image_processor"] = self.get_component("image_processor", do_image_splitting=False)
processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(**processor_components, **processor_kwargs)
# Test that a single image is processed correctly
inputs = processor(images=self.image1)
image1_expected_size = (512, 512)
self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 1, 3, *image1_expected_size))
self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 1, *image1_expected_size))
# fmt: on
# Test a single sample with image and text
image_str = "<image>"
text_str = "In this image, we see"
text = image_str + text_str
inputs = processor(text=text, images=self.image1)
# fmt: off
tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
expected_input_ids = [[self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
self.assertEqual(inputs["input_ids"], expected_input_ids)
self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 1, 3, *image1_expected_size))
self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 1, *image1_expected_size))
# fmt: on
# Test that batch is correctly processed
image_str = "<image>"
text_str_1 = "In this image, we see"
text_str_2 = "In this image, we see"
text = [
image_str + text_str_1,
image_str + image_str + text_str_2,
]
images = [[self.image1], [self.image2, self.image3]]
inputs = processor(text=text, images=images, padding=True)
# fmt: off
tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False)
tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False)
image_tokens = [self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
expected_input_ids_1 = image_tokens + tokenized_sentence_1["input_ids"]
expected_input_ids_2 = 2 * image_tokens + tokenized_sentence_2["input_ids"]
# Pad the first input to match the second input
pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
padded_expected_input_ids_1 = [self.padding_token_id] * pad_len + expected_input_ids_1
self.assertEqual(
inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2]
)
self.assertEqual(
inputs["attention_mask"],
[[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
)
self.assertEqual(np.array(inputs['pixel_values']).shape, (2, 2, 3, 512, 512))
self.assertEqual(np.array(inputs['pixel_attention_mask']).shape, (2, 2, 512, 512))
# fmt: on
def test_process_interleaved_images_prompts_image_splitting(self):
processor_components = self.prepare_components()
processor_components["tokenizer"] = self.get_component("tokenizer", padding_side="left")
processor_components["image_processor"] = self.get_component("image_processor", do_image_splitting=True)
processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(**processor_components, **processor_kwargs)
# Test that a single image is processed correctly
inputs = processor(images=self.image1)
self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 13, 3, 512, 512))
self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 13, 512, 512))
# fmt: on
self.maxDiff = None
# Test a single sample with image and text
image_str = "<image>"
text_str = "In this image, we see"
text = image_str + text_str
inputs = processor(text=text, images=self.image1)
# fmt: off
tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
split_image1_tokens = self.get_split_image_expected_tokens(processor, 3, 4)
expected_input_ids_1 = [split_image1_tokens + tokenized_sentence["input_ids"]]
self.assertEqual(inputs["input_ids"], expected_input_ids_1)
self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids_1[0])])
self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 13, 3, 512, 512))
self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 13, 512, 512))
# fmt: on
# Test that batch is correctly processed
image_str = "<image>"
text_str_1 = "In this image, we see"
text_str_2 = "bla, bla"
text = [
image_str + text_str_1,
text_str_2 + image_str + image_str,
]
images = [[self.image1], [self.image2, self.image3]]
inputs = processor(text=text, images=images, padding=True)
# fmt: off
tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False)
tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False)
split_image1_tokens = self.get_split_image_expected_tokens(processor, 3, 4)
split_image2_tokens = self.get_split_image_expected_tokens(processor, 4, 4)
split_image3_tokens = self.get_split_image_expected_tokens(processor, 3, 4)
expected_input_ids_1 = split_image1_tokens + tokenized_sentence_1["input_ids"]
expected_input_ids_2 = tokenized_sentence_2["input_ids"] + split_image2_tokens + split_image3_tokens
# Pad the first input to match the second input
pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
padded_expected_input_ids_1 = [self.padding_token_id] * pad_len + expected_input_ids_1
self.assertEqual(
inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2]
)
self.assertEqual(
inputs["attention_mask"],
[[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
)
self.assertEqual(np.array(inputs['pixel_values']).shape, (2, 30, 3, 512, 512))
self.assertEqual(np.array(inputs['pixel_attention_mask']).shape, (2, 30, 512, 512))
# fmt: on
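# The patch counts in the shape checks above follow from the split grids used in
# get_split_image_expected_tokens: image1 splits into 3x4 tiles + 1 global = 13,
# image2 into 4x4 + 1 = 17 and image3 into 3x4 + 1 = 13, so the second batch sample
# holds 17 + 13 = 30 patches and the first sample is padded up to 30 as well.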
def test_add_special_tokens_processor(self):
processor = self.get_processor()
image_str = "<image>"
text_str = "In this image, we see"
text = text_str + image_str
# fmt: off
inputs = processor(text=text, images=self.image1, add_special_tokens=False)
tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
split_image1_tokens = self.get_split_image_expected_tokens(processor, 3, 4)
expected_input_ids = [tokenized_sentence["input_ids"] + split_image1_tokens]
self.assertEqual(inputs["input_ids"], expected_input_ids)
inputs = processor(text=text, images=self.image1)
expected_input_ids = [tokenized_sentence["input_ids"] + split_image1_tokens]
self.assertEqual(inputs["input_ids"], expected_input_ids)
# fmt: on
@unittest.skip(reason="from @molbap @zucchini-nlp, passing non-nested images is error-prone and not recommended")
def test_non_nested_images_with_batched_text(self):
processor = self.get_processor()
processor.image_processor.do_image_splitting = False
image_str = "<image>"
text_str_1 = "In this image, we see"
text_str_2 = "In this image, we see"
text = [
image_str + text_str_1,
image_str + image_str + text_str_2,
]
images = [[self.image1], [self.image2, self.image3]]
inputs = processor(text=text, images=images, padding=True)
self.assertEqual(np.array(inputs["pixel_values"]).shape, (2, 2, 3, 512, 512))
self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (2, 2, 512, 512))
# Copied from tests.models.idefics2.test_processing_idefics2.Idefics2ProcessorTest.test_process_interleaved_images_prompts_image_error
def test_process_interleaved_images_prompts_image_error(self):
processor = self.get_processor()
text = [
"This is a test sentence.",
"In this other sentence we try some good things",
]
images = [[self.image1], [self.image2]]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
images = [[self.image1], []]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
text = [
"This is a test sentence.<image>",
"In this other sentence we try some good things<image>",
]
images = [[self.image1], [self.image2, self.image3]]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
images = [[], [self.image2]]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
images = [self.image1, self.image2, self.image3]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
images = [self.image1]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
text = [
"This is a test sentence.",
"In this other sentence we try some good things<image>",
]
images = [[self.image1], []]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
images = [self.image1, self.image2]
with self.assertRaises(ValueError):
processor(text=text, images=images, padding=True)
def test_apply_chat_template(self):
# Messages contain content that is a mix of lists with images, image URLs, and strings
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "What do these images show?"},
{"type": "image"},
{"type": "image"},
],
},
{
"role": "assistant",
"content": [
{
"type": "text",
"text": "The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.",
}
],
},
{"role": "user", "content": [{"type": "text", "text": "And who is that?"}]},
]
processor = self.get_processor()
# Use a short sequence length to test that the fake tokens are added correctly
rendered = processor.apply_chat_template(messages, add_generation_prompt=True)
expected_rendered = (
"<|im_start|>User: What do these images show?<image><image><end_of_utterance>\n"
"Assistant: The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.<end_of_utterance>\n"
"User: And who is that?<end_of_utterance>\n"
"Assistant:"
)
self.assertEqual(rendered, expected_rendered)
@require_av
@require_torch
def test_apply_chat_template_video_frame_sampling(self):
# overridden because SmolVLM has special preprocessing for videos
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
messages = [
[
{
"role": "user",
"content": [
{
"type": "video",
"url": url_to_local_path(
"https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
),
},
{"type": "text", "text": "What is shown in this video?"},
],
},
]
]
num_frames = 3
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
num_frames=num_frames,
return_tensors="pt",
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
# SmolVLM doesn't sample exactly `num_frames` frames, but uses another sampling method
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 1)
# Load with `fps` arg
fps = 10
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
fps=fps,
return_tensors="pt",
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
# SmolVLM doesn't sample exactly 1 frame per second, but uses another sampling method
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 4)
# NOTE: the final assert checks from the base test are removed because
# loading a video as a list of frames (i.e. images) is not supported in SmolVLM
@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
video_processor = self.get_component("video_processor")
tokenizer = self.get_component("tokenizer")
processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor, **processor_kwargs
)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=2, modalities="image")
image_input = self.prepare_image_inputs(batch_size=2)
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
padding="max_length",
max_length=76,
truncation=True,
max_image_size={"longest_edge": 300},
)
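# pixel_values is (batch, num_images, channels, height, width); the call-time max_image_size resizes the longest edge to 300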
self.assertEqual(inputs["pixel_values"].shape[2], 3)
self.assertEqual(inputs["pixel_values"].shape[3], 300)
self.assertEqual(len(inputs["input_ids"][0]), 76)
@require_torch
@require_vision
def test_unstructured_kwargs_batched_video(self):
if "video_processor" not in self.processor_class.attributes:
self.skipTest(f"video_processor attribute not present in {self.processor_class}")
processor_components = self.prepare_components()
processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(**processor_components, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=2, modalities="video")
video_input = self.prepare_video_inputs(batch_size=2)
inputs = processor(
text=input_str,
videos=video_input,
return_tensors="pt",
do_rescale=True,
rescale_factor=-1,
padding="max_length",
max_length=172,
)
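# A negative rescale_factor makes every pixel value non-positive, hence the mean check below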
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
self.assertEqual(len(inputs["input_ids"][0]), 172)
@require_torch
@require_vision
def test_text_only_inference(self):
"""Test that the processor works correctly with text-only input."""
processor_components = self.prepare_components()
processor_components["tokenizer"] = self.get_component("tokenizer", padding_side="left")
processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(**processor_components, **processor_kwargs)
text = "This is a simple text without images."
inputs = processor(text=text)
tokenized_sentence = processor.tokenizer(text, add_special_tokens=False)
expected_input_ids = [tokenized_sentence["input_ids"]]
self.assertEqual(inputs["input_ids"], expected_input_ids)
self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
self.assertTrue("pixel_values" not in inputs)
self.assertTrue("pixel_attention_mask" not in inputs)
# Test batch of texts without image tokens
texts = ["First text.", "Second piece of text."]
batch_inputs = processor(text=texts, padding=True)
tokenized_1 = processor.tokenizer(texts[0], add_special_tokens=False)
tokenized_2 = processor.tokenizer(texts[1], add_special_tokens=False)
expected_1 = tokenized_1["input_ids"]
expected_2 = tokenized_2["input_ids"]
# Pad the shorter sequence
pad_len = len(expected_2) - len(expected_1)
if pad_len > 0:
padded_expected_1 = [self.padding_token_id] * pad_len + expected_1
expected_attention_1 = [0] * pad_len + [1] * len(expected_1)
self.assertEqual(batch_inputs["input_ids"], [padded_expected_1, expected_2])
self.assertEqual(batch_inputs["attention_mask"], [expected_attention_1, [1] * len(expected_2)])
else:
pad_len = -pad_len
padded_expected_2 = [self.padding_token_id] * pad_len + expected_2
expected_attention_2 = [0] * pad_len + [1] * len(expected_2)
self.assertEqual(batch_inputs["input_ids"], [expected_1, padded_expected_2])
self.assertEqual(batch_inputs["attention_mask"], [[1] * len(expected_1), expected_attention_2])
@require_torch
@require_vision
def test_missing_images_error(self):
"""Test that appropriate error is raised when images are referenced but not provided."""
processor = self.get_processor()
# Test single text with image token but no image
text = "Let me show you this image: <image> What do you think?"
with self.assertRaises(ValueError) as context:
processor(text=text)
self.assertTrue("tokens in the text but no images/videos were passed" in str(context.exception))
# Test batch with image tokens but no images
texts = [
"First text with <image> token.",
"Second text <image> with token.",
]
with self.assertRaises(ValueError) as context:
processor(text=texts)
self.assertTrue("tokens in the text but no images/videos were passed" in str(context.exception))
# Test with None as Images
with self.assertRaises(ValueError) as context:
processor(text=text, images=None)
self.assertTrue("tokens in the text but no images/videos were passed" in str(context.exception))
with self.assertRaises(ValueError) as context:
processor(text=texts, images=None)
self.assertTrue("tokens in the text but no images/videos were passed" in str(context.exception))
def test_special_mm_token_truncation(self):
"""Tests that special vision tokens do not get truncated when `truncation=True` is set."""
processor = self.get_processor()
input_str = self.prepare_text_inputs(batch_size=2, modalities="image")
image_input = self.prepare_image_inputs(batch_size=2)
_ = processor(
text=input_str,
images=image_input,
return_tensors="pt",
truncation=None,
padding=True,
)
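# With truncation enabled and a max_length too small for the expanded image tokens,
# the processor should raise instead of cutting special vision tokens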
with self.assertRaises(ValueError):
_ = processor(
text=input_str,
images=image_input,
return_tensors="pt",
truncation=True,
padding=True,
max_length=20,
)
@unittest.skip(
"SmolVLM cannot accept list of decoded video frames, because it needs to know video fps and duration"
)
def test_apply_chat_template_decoded_video_0(self):
pass

View File

@@ -0,0 +1,149 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torchvision_available, is_vision_available
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
if is_vision_available():
if is_torchvision_available():
from transformers import SmolVLMVideoProcessor
class SmolVLMVideoProcessingTester:
def __init__(
self,
parent,
batch_size=5,
num_frames=8,
num_channels=3,
min_resolution=30,
max_resolution=80,
do_resize=True,
size=None,
do_normalize=True,
image_mean=IMAGENET_STANDARD_MEAN,
image_std=IMAGENET_STANDARD_STD,
do_convert_rgb=True,
):
size = size if size is not None else {"longest_edge": 20}
self.parent = parent
self.batch_size = batch_size
self.num_frames = num_frames
self.num_channels = num_channels
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.max_image_size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
def prepare_video_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
"max_image_size": self.max_image_size,
}
def expected_output_video_shape(self, videos):
return [
self.num_frames,
self.num_channels,
self.max_image_size["longest_edge"],
self.max_image_size["longest_edge"],
]
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
videos = prepare_video_inputs(
batch_size=self.batch_size,
num_frames=self.num_frames,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
return_tensors=return_tensors,
)
return videos
@require_torch
@require_vision
class SmolVLMVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
fast_video_processing_class = SmolVLMVideoProcessor if is_torchvision_available() else None
input_name = "pixel_values"
def setUp(self):
super().setUp()
self.video_processor_tester = SmolVLMVideoProcessingTester(self)
@property
def video_processor_dict(self):
return self.video_processor_tester.prepare_video_processor_dict()
def test_video_processor_from_dict_with_kwargs(self):
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
self.assertEqual(video_processor.size, {"longest_edge": 20})
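# An integer `size` kwarg overrides the dict form and is interpreted as a square (height, width)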
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
self.assertEqual(video_processor.size, {"height": 42, "width": 42})
# overwritten because SmolVLM requires video metadata no matter how we sample
def test_call_sample_frames(self):
for video_processing_class in self.video_processor_list:
video_processing = video_processing_class(**self.video_processor_dict)
prev_num_frames = self.video_processor_tester.num_frames
self.video_processor_tester.num_frames = 8
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False,
return_tensors="torch",
)
# Force set sampling to False. No sampling is expected even when `num_frames` exists
video_processing.do_sample_frames = False
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
self.assertEqual(encoded_videos.shape[1], 8)
self.assertEqual(encoded_videos_batched.shape[1], 8)
# Set sampling to True. Video frames should be sampled with `num_frames` in the output
video_processing.do_sample_frames = True
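# SmolVLM needs video metadata (duration, total frames, fps) to resample frames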
metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
batched_metadata = metadata * len(video_inputs)
encoded_videos = video_processing(
video_inputs[0], return_tensors="pt", num_frames=6, fps=3, video_metadata=metadata
)[self.input_name]
encoded_videos_batched = video_processing(
video_inputs, return_tensors="pt", num_frames=6, fps=3, video_metadata=batched_metadata
)[self.input_name]
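# With sampling enabled, exactly `num_frames` frames are returned regardless of input length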
self.assertEqual(encoded_videos.shape[1], 6)
self.assertEqual(encoded_videos_batched.shape[1], 6)
# Assign back the actual num frames in tester
self.video_processor_tester.num_frames = prev_num_frames