This commit is contained in:
2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions

View File

@@ -0,0 +1,383 @@
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin
if is_vision_available():
from PIL import Image
from transformers import MllamaImageProcessor
if is_torch_available():
import torch
class MllamaImageProcessingTester:
    """Helper that builds `MllamaImageProcessor` kwargs and random image inputs for the tests."""

    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        image_size=18,
        num_images=18,
        min_resolution=30,
        max_resolution=400,
        do_resize=True,
        size=None,
        do_rescale=True,
        rescale_factor=1 / 255,
        do_normalize=True,
        image_mean=[0.5, 0.5, 0.5],
        image_std=[0.5, 0.5, 0.5],
        do_convert_rgb=True,
        do_pad=True,
        max_image_tiles=4,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.num_images = num_images
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.max_image_tiles = max_image_tiles
        # Fall back to the processor's default tile resolution when no size is given.
        self.size = size if size is not None else {"height": 224, "width": 224}
        self.do_resize = do_resize
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_convert_rgb = do_convert_rgb
        self.do_pad = do_pad

    def prepare_image_processor_dict(self):
        """Return the kwargs used to instantiate the image processor under test."""
        return dict(
            do_convert_rgb=self.do_convert_rgb,
            do_resize=self.do_resize,
            size=self.size,
            do_rescale=self.do_rescale,
            rescale_factor=self.rescale_factor,
            do_normalize=self.do_normalize,
            image_mean=self.image_mean,
            image_std=self.image_std,
            do_pad=self.do_pad,
            max_image_tiles=self.max_image_tiles,
        )

    def prepare_image_inputs(
        self,
        batch_size=None,
        min_resolution=None,
        max_resolution=None,
        num_channels=None,
        num_images=None,
        size_divisor=None,
        equal_resolution=False,
        numpify=False,
        torchify=False,
    ):
        """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
        or a list of PyTorch tensors if one specifies torchify=True.
        One can specify whether the images are of the same resolution or not.
        """
        assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
        # Any argument left as None falls back to the tester's configured default.
        if batch_size is None:
            batch_size = self.batch_size
        if min_resolution is None:
            min_resolution = self.min_resolution
        if max_resolution is None:
            max_resolution = self.max_resolution
        if num_channels is None:
            num_channels = self.num_channels
        if num_images is None:
            num_images = self.num_images

        batch = []
        for _ in range(batch_size):
            sample = []
            for _ in range(num_images):
                if equal_resolution:
                    width = height = max_resolution
                else:
                    # To avoid getting image width/height 0
                    if size_divisor is not None:
                        # If `size_divisor` is defined, the image needs to have width/size >= `size_divisor`
                        min_resolution = max(size_divisor, min_resolution)
                    width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
                sample.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8))
            batch.append(sample)

        if torchify:
            return [[torch.from_numpy(arr) for arr in sample] for sample in batch]
        if numpify:
            return batch
        # PIL expects the channel dimension as last dimension
        return [[Image.fromarray(np.moveaxis(arr, 0, -1)) for arr in sample] for sample in batch]

    def expected_output_image_shape(self, images):
        """Shape of `pixel_values` per sample: (max images per sample, tiles, channels, height, width)."""
        largest_num_images = max(len(sample) for sample in images)
        return (
            largest_num_images,
            self.max_image_tiles,
            self.num_channels,
            self.size["height"],
            self.size["width"],
        )
@require_torch
@require_vision
class MllamaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    """Tests for `MllamaImageProcessor`: input formats (PIL/numpy/torch), padding, tiling and
    aspect-ratio metadata (`aspect_ratio_ids`, `aspect_ratio_mask`)."""

    image_processing_class = MllamaImageProcessor if is_vision_available() else None

    def setUp(self):
        super().setUp()
        self.image_processor_tester = MllamaImageProcessingTester(self)

    @property
    def image_processor_dict(self):
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
        """Every configurable option is exposed as an attribute on the processor instance."""
        image_processing = self.image_processing_class(**self.image_processor_dict)
        self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
        self.assertTrue(hasattr(image_processing, "do_resize"))
        self.assertTrue(hasattr(image_processing, "size"))
        self.assertTrue(hasattr(image_processing, "do_rescale"))
        self.assertTrue(hasattr(image_processing, "rescale_factor"))
        self.assertTrue(hasattr(image_processing, "do_normalize"))
        self.assertTrue(hasattr(image_processing, "image_mean"))
        self.assertTrue(hasattr(image_processing, "image_std"))
        self.assertTrue(hasattr(image_processing, "do_pad"))
        self.assertTrue(hasattr(image_processing, "max_image_tiles"))

    def test_call_numpy(self):
        """Numpy inputs produce the expected (batch, num_images, tiles, C, H, W) pixel_values shape."""
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
        for sample_images in image_inputs:
            for image in sample_images:
                self.assertIsInstance(image, np.ndarray)
        # NOTE: a hand-built expected-shape tuple used to be computed here but was immediately
        # overwritten by `expected_output_image_shape(...)` below; the dead computation was removed.
        # Test not batched input
        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
        # Test batched
        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        self.assertEqual(
            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
        )

    def test_call_pil(self):
        """PIL inputs produce the expected pixel_values shape, batched and unbatched."""
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        # create random PIL images
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
        for images in image_inputs:
            for image in images:
                self.assertIsInstance(image, Image.Image)
        # Test not batched input
        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
        # Test batched
        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        self.assertEqual(
            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
        )

    def test_call_channels_last(self):
        """`input_data_format="channels_last"` is accepted for HWC numpy inputs."""
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        # a white 1x1 pixel RGB image
        image_inputs = [[np.full(shape=(1, 1, 3), fill_value=1.0, dtype=float)]]
        encoded_images = image_processing(
            image_inputs, return_tensors="pt", input_data_format="channels_last"
        ).pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

    def test_ambiguous_channel_pil_image(self):
        """Images whose smallest dimension could be confused with a channel axis are handled."""
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        image_inputs = [[Image.new("RGB", (1, 1))], [Image.new("RGB", (100, 1))]]
        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        self.assertEqual(tuple(encoded_images.shape), (2, *expected_output_image_shape))

    def test_resize_impractical_aspect_ratio(self):
        """An extreme aspect ratio must not crash the resize logic."""
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        # Ensure that no error is raised even if the aspect ratio is impractical
        image_inputs = [[Image.new("RGB", (9999999, 1))]]
        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

    def test_call_pytorch(self):
        """Torch tensor inputs produce the expected pixel_values shape, batched and unbatched."""
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        # create random PyTorch tensors
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
        for images in image_inputs:
            for image in images:
                self.assertIsInstance(image, torch.Tensor)
        # Test not batched input
        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
        # Test batched
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
        self.assertEqual(
            tuple(encoded_images.shape),
            (self.image_processor_tester.batch_size, *expected_output_image_shape),
        )

    def test_call_numpy_4_channels(self):
        self.skipTest("4 channels input is not supported yet")

    def test_image_correctly_tiled(self):
        """Check tile placement, padding tiles, `aspect_ratio_ids` and `aspect_ratio_mask`
        for images that map onto different tile grids."""

        def get_empty_tiles(pixel_values):
            # image has shape batch_size, max_num_images, max_image_tiles, num_channels, height, width
            # we want to get a binary mask of shape batch_size, max_num_images, max_image_tiles
            # of empty tiles, i.e. tiles that are completely zero
            return np.all(pixel_values == 0, axis=(3, 4, 5))

        image_processor_dict = {**self.image_processor_dict, "size": {"height": 50, "width": 50}, "max_image_tiles": 4}
        image_processor = self.image_processing_class(**image_processor_dict)

        # image fits 2x2 tiles grid (width x height)
        image = Image.new("RGB", (80, 95))
        inputs = image_processor(image, return_tensors="np")
        pixel_values = inputs.pixel_values
        empty_tiles = get_empty_tiles(pixel_values)[0, 0].tolist()
        self.assertEqual(empty_tiles, [False, False, False, False])
        aspect_ratio_ids = inputs.aspect_ratio_ids[0, 0]
        self.assertEqual(aspect_ratio_ids, 6)
        aspect_ratio_mask = inputs.aspect_ratio_mask[0, 0].tolist()
        self.assertEqual(aspect_ratio_mask, [1, 1, 1, 1])

        # image fits 3x1 grid (width x height)
        image = Image.new("RGB", (101, 50))
        inputs = image_processor(image, return_tensors="np")
        pixel_values = inputs.pixel_values
        empty_tiles = get_empty_tiles(pixel_values)[0, 0].tolist()
        self.assertEqual(empty_tiles, [False, False, False, True])
        aspect_ratio_ids = inputs.aspect_ratio_ids[0, 0]
        self.assertEqual(aspect_ratio_ids, 3)
        num_tiles = inputs.aspect_ratio_mask[0, 0].sum()
        self.assertEqual(num_tiles, 3)
        aspect_ratio_mask = inputs.aspect_ratio_mask[0, 0].tolist()
        self.assertEqual(aspect_ratio_mask, [1, 1, 1, 0])

        # image fits 1x1 grid (width x height)
        image = Image.new("RGB", (20, 39))
        inputs = image_processor(image, return_tensors="np")
        pixel_values = inputs.pixel_values
        empty_tiles = get_empty_tiles(pixel_values)[0, 0].tolist()
        self.assertEqual(empty_tiles, [False, True, True, True])
        aspect_ratio_ids = inputs.aspect_ratio_ids[0, 0]
        self.assertEqual(aspect_ratio_ids, 1)
        aspect_ratio_mask = inputs.aspect_ratio_mask[0, 0].tolist()
        self.assertEqual(aspect_ratio_mask, [1, 0, 0, 0])

        # image fits 2x1 grid (width x height)
        image = Image.new("RGB", (51, 20))
        inputs = image_processor(image, return_tensors="np")
        pixel_values = inputs.pixel_values
        empty_tiles = get_empty_tiles(pixel_values)[0, 0].tolist()
        self.assertEqual(empty_tiles, [False, False, True, True])
        aspect_ratio_ids = inputs.aspect_ratio_ids[0, 0]
        self.assertEqual(aspect_ratio_ids, 2)
        aspect_ratio_mask = inputs.aspect_ratio_mask[0, 0].tolist()
        self.assertEqual(aspect_ratio_mask, [1, 1, 0, 0])

        # image is greater than 2x2 tiles grid (width x height)
        image = Image.new("RGB", (150, 150))
        inputs = image_processor(image, return_tensors="np")
        pixel_values = inputs.pixel_values
        empty_tiles = get_empty_tiles(pixel_values)[0, 0].tolist()
        self.assertEqual(empty_tiles, [False, False, False, False])
        aspect_ratio_ids = inputs.aspect_ratio_ids[0, 0]
        self.assertEqual(aspect_ratio_ids, 6)  # (2 - 1) * 4 + 2 = 6
        aspect_ratio_mask = inputs.aspect_ratio_mask[0, 0].tolist()
        self.assertEqual(aspect_ratio_mask, [1, 1, 1, 1])

        # batch of images
        image1 = Image.new("RGB", (80, 95))
        image2 = Image.new("RGB", (101, 50))
        image3 = Image.new("RGB", (23, 49))
        inputs = image_processor([[image1], [image2, image3]], return_tensors="np")
        pixel_values = inputs.pixel_values
        empty_tiles = get_empty_tiles(pixel_values).tolist()
        expected_empty_tiles = [
            # sample 1 with 1 image 2x2 grid
            [
                [False, False, False, False],
                [True, True, True, True],  # padding
            ],
            # sample 2
            [
                [False, False, False, True],  # 3x1
                [False, True, True, True],  # 1x1
            ],
        ]
        self.assertEqual(empty_tiles, expected_empty_tiles)
        aspect_ratio_ids = inputs.aspect_ratio_ids.tolist()
        expected_aspect_ratio_ids = [[6, 0], [3, 1]]
        self.assertEqual(aspect_ratio_ids, expected_aspect_ratio_ids)
        aspect_ratio_mask = inputs.aspect_ratio_mask.tolist()
        expected_aspect_ratio_mask = [
            [
                [1, 1, 1, 1],
                [1, 0, 0, 0],
            ],
            [
                [1, 1, 1, 0],
                [1, 0, 0, 0],
            ],
        ]
        self.assertEqual(aspect_ratio_mask, expected_aspect_ratio_mask)

View File

@@ -0,0 +1,805 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Mllama model."""
import unittest
import pytest
import requests
from transformers import (
AutoProcessor,
BitsAndBytesConfig,
MllamaConfig,
MllamaForCausalLM,
MllamaForConditionalGeneration,
MllamaModel,
is_torch_available,
is_vision_available,
)
from transformers.cache_utils import Cache
from transformers.models.mllama.configuration_mllama import MllamaTextConfig
from transformers.testing_utils import (
Expectations,
cleanup,
require_bitsandbytes,
require_optimum_quanto,
require_read_token,
require_torch,
require_torch_accelerator,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
class MllamaText2TextModelTester:
    """Builds a tiny text-only Mllama config plus matching dummy inputs for `MllamaForCausalLM` tests."""

    def __init__(
        self,
        parent,
        ignore_index=-100,
        seq_length=7,
        is_training=True,
        text_config={
            "model_type": "mllama",
            "vocab_size": 99,
            "hidden_size": 32,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
            "num_key_value_heads": 4,
            "intermediate_size": 37,
            "hidden_act": "gelu",
            "max_position_embeddings": 512,
            "initializer_range": 0.02,
            "rope_scaling": {"rope_type": "default"},
            "pad_token_id": 0,
            "bos_token_id": 1,
            "eos_token_id": 2,
        },
    ):
        self.parent = parent
        self.ignore_index = ignore_index
        self.is_training = is_training
        self.seq_length = seq_length
        self.text_config = text_config
        # Mirror the config entries the common test mixins read as attributes.
        self.num_hidden_layers = text_config["num_hidden_layers"]
        self.vocab_size = text_config["vocab_size"]
        self.hidden_size = text_config["hidden_size"]
        self.num_attention_heads = text_config["num_attention_heads"]
        self.pad_token_id = text_config["pad_token_id"]
        self.batch_size = 3

    def get_config(self):
        """Instantiate the tiny text config."""
        return MllamaTextConfig(**self.text_config)

    def prepare_config_and_inputs(self):
        """Return (config, input_ids, attention_mask) with ids shifted so token 0 never appears."""
        config = self.get_config()
        token_ids = ids_tensor([self.batch_size, self.seq_length], config.vocab_size - 1) + 1
        mask = token_ids.ne(1).to(torch_device)
        return config, token_ids, mask

    def prepare_config_and_inputs_for_common(self):
        config, token_ids, mask = self.prepare_config_and_inputs()
        return config, {"input_ids": token_ids, "attention_mask": mask}

    def create_and_check_mllama_model_fp16_forward(self, config, input_ids, attention_mask):
        """Run an autocast fp16 forward pass and assert the logits contain no NaNs."""
        model = MllamaForCausalLM(config=config)
        model.to(torch_device)
        model.eval()
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                return_dict=True,
            )
        self.parent.assertFalse(torch.isnan(outputs["logits"]).any().item())
@require_torch
class MllamaForCausalLMModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    """
    Model tester for `MllamaForCausalLM` (text-only variant; see `all_model_classes` below).
    """

    all_model_classes = (MllamaForCausalLM,) if is_torch_available() else ()
    # Pruning and head masking are not supported for this architecture's tests.
    test_pruning = False
    test_head_masking = False

    def setUp(self):
        # Text-only tester: uses `MllamaTextConfig` rather than the composite `MllamaConfig`.
        self.model_tester = MllamaText2TextModelTester(self)
        self.config_tester = ConfigTester(self, config_class=MllamaTextConfig, has_text_modality=True)
class MllamaVisionText2TextModelTester:
    """
    Builds a tiny multimodal (vision + text) Mllama config and matching dummy inputs for the
    `MllamaModel` / `MllamaForConditionalGeneration` test suites.
    """

    def __init__(
        self,
        parent,
        ignore_index=-100,
        image_token_index=4,
        seq_length=7,
        is_training=True,
        # NOTE(review): mutable dict defaults are shared across instances; harmless here because
        # they are only read, never mutated.
        text_config={
            "model_type": "mllama",
            "vocab_size": 99,
            "hidden_size": 32,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
            "num_key_value_heads": 4,
            "intermediate_size": 37,
            "hidden_act": "gelu",
            "max_position_embeddings": 512,
            "initializer_range": 0.02,
            "rope_scaling": {"rope_type": "default"},
            "pad_token_id": 0,
            "bos_token_id": 1,
            "eos_token_id": 2,
            # Layer 1 attends over image features via cross-attention.
            "cross_attention_layers": [1],
        },
        vision_config={
            "image_size": 30,
            "patch_size": 2,
            "num_channels": 3,
            "hidden_size": 16,
            "intermediate_layers_indices": [0],
            "vision_output_dim": 32,
            "projection_dim": 32,
            "num_hidden_layers": 2,
            "num_global_layers": 2,
            "num_attention_heads": 4,
            "intermediate_size": 37,
            "dropout": 0.1,
            "initializer_range": 0.02,
            "supported_aspect_ratios": [[1, 1], [1, 2], [1, 3], [1, 4], [2, 1], [2, 2], [3, 1], [4, 1]],
        },
    ):
        self.parent = parent
        self.is_training = is_training
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.text_config = text_config
        self.vision_config = vision_config
        self.seq_length = seq_length
        # Mirror the config entries the common test mixins read as attributes.
        self.num_hidden_layers = text_config["num_hidden_layers"]
        self.vocab_size = text_config["vocab_size"]
        self.hidden_size = text_config["hidden_size"]
        self.num_attention_heads = text_config["num_attention_heads"]
        self.pad_token_id = self.text_config["pad_token_id"]
        self.batch_size = 3
        self.num_channels = 3
        self.image_size = 224
        self.max_num_images = 1
        self.max_image_tiles = 4
        # Length of the image-token sequence seen by cross-attention:
        # 4 tiles * ((30 / 2)**2 patches + 1 CLS token) = 904.
        self.image_length = 904

    def get_config(self):
        """Instantiate the composite multimodal config."""
        return MllamaConfig(
            text_config=self.text_config,
            vision_config=self.vision_config,
            image_token_index=self.image_token_index,
        )

    def prepare_config_and_inputs(self):
        """Return (config, pixel_values, aspect_ratio_ids, aspect_ratio_mask) dummy vision inputs."""
        pixel_values = floats_tensor(
            [
                self.batch_size,
                self.max_num_images,
                self.max_image_tiles,
                self.vision_config["num_channels"],
                self.vision_config["image_size"],
                self.vision_config["image_size"],
            ]
        )
        # Aspect ratio id 6 corresponds to the 2x2 grid; one id per (sample, image).
        aspect_ratio_ids = torch.tensor([[6] * self.batch_size], device=torch_device).transpose(0, 1)
        # All tiles are marked valid (no padding tiles).
        aspect_ratio_mask = torch.ones(self.batch_size, self.max_num_images, self.max_image_tiles)
        config = self.get_config()
        return config, pixel_values, aspect_ratio_ids, aspect_ratio_mask

    def prepare_config_and_inputs_for_common(self):
        """Build the full inputs dict (text + vision + cross-attention mask) for the common tests."""
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values, aspect_ratio_ids, aspect_ratio_mask = config_and_inputs
        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
        attention_mask = input_ids.ne(1).to(torch_device)
        aspect_ratio_mask = aspect_ratio_mask.to(torch_device)
        # Every text position may attend to every tile of every image.
        cross_attention_mask = torch.ones(
            (self.batch_size, self.seq_length, self.max_num_images, self.max_image_tiles), device=torch_device
        )
        # First remove any randomly generated image tokens, then place exactly one image token
        # at position 1 of each sequence — order matters here.
        input_ids[input_ids == config.image_token_index] = self.pad_token_id
        input_ids[:, 1] = config.image_token_index
        inputs_dict = {
            "pixel_values": pixel_values,
            "aspect_ratio_ids": aspect_ratio_ids,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "aspect_ratio_mask": aspect_ratio_mask,
            "cross_attention_mask": cross_attention_mask,
            "use_cache": True,
        }
        return config, inputs_dict

    def create_and_check_mllama_model_fp16_forward(self, config, input_ids, pixel_values, attention_mask):
        """Run an autocast fp16 forward pass and assert the logits contain no NaNs."""
        model = MllamaForConditionalGeneration(config=config)
        model.to(torch_device)
        model.eval()
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                # NOTE(review): pixel values are cast to bfloat16 while autocast targets float16 —
                # presumably intentional to exercise mixed dtypes; confirm.
                pixel_values=pixel_values.to(torch.bfloat16),
                return_dict=True,
            )["logits"]
        self.parent.assertFalse(torch.isnan(logits).any().item())

    # NOTE(review): these skip-decorated `test_*` methods sit inside a tester class that is not a
    # `unittest.TestCase`, so they are never collected — they look misplaced; confirm intended class.
    @unittest.skip("Mllama applies key/query norm which doesn't work with packing")
    def test_eager_padding_matches_padding_free_with_position_ids(self):
        pass

    @unittest.skip("Mllama applies key/query norm which doesn't work with packing")
    def test_sdpa_padding_matches_padding_free_with_position_ids(self):
        pass
@require_torch
class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    """
    Model tester for `MllamaForConditionalGeneration`.
    """

    all_model_classes = (
        (
            MllamaModel,
            MllamaForConditionalGeneration,
        )
        if is_torch_available()
        else ()
    )
    pipeline_model_mapping = {"image-text-to-text": MllamaForConditionalGeneration} if is_torch_available() else ()
    test_pruning = False
    test_head_masking = False
    test_torchscript = False
    # The model is composed of separate vision and text sub-models.
    _is_composite = True

    def setUp(self):
        self.model_tester = MllamaVisionText2TextModelTester(self)
        self.config_tester = ConfigTester(
            self, config_class=MllamaConfig, has_text_modality=False, common_properties=["image_token_index"]
        )

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_resize_embeddings_results_in_successful_loss(self):
        # resizing embeddings should result in successful loss computation
        config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
        model = MllamaForConditionalGeneration(config).to(torch_device)
        model_vocab_size = config.get_text_config().vocab_size
        inputs = self._prepare_for_class(inputs, MllamaForConditionalGeneration, return_labels=True)
        # Resize embeddings and call forward
        model.resize_token_embeddings(model_vocab_size + 10)
        output = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            labels=inputs["labels"],
            return_dict=True,
        )
        self.assertTrue("loss" in output)

    def _check_attentions_for_generate(
        self, batch_size, attentions, prompt_length, output_length, config, decoder_past_key_values
    ):
        """Overridden shape check: cross-attention layers attend over `image_length` keys
        instead of the text sequence."""
        # Mllama has cross attention layers and those have a different shape than normal attention layers
        self.assertIsInstance(attentions, tuple)
        self.assertListEqual(
            [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions)
        )
        self.assertEqual(len(attentions), (output_length - prompt_length))
        cross_attention_layers = self.model_tester.text_config["cross_attention_layers"]
        use_cache = decoder_past_key_values is not None
        for generated_length, iter_attentions in enumerate(attentions):
            # regardless of using cache, the first forward pass will have the full prompt as input
            if use_cache and generated_length > 0:
                model_input_length = 1
            else:
                model_input_length = prompt_length + generated_length
            query_length = prompt_length + generated_length
            # Self-attention: (batch, heads, new tokens, all tokens so far).
            expected_shape = (
                batch_size,
                config.num_attention_heads,
                model_input_length,
                query_length,
            )
            # Cross-attention: key length is the fixed image-token sequence length.
            expected_shape_cross = (
                batch_size,
                config.num_attention_heads,
                model_input_length,
                self.model_tester.image_length,
            )
            expected_shapes = [
                expected_shape if layer_idx not in cross_attention_layers else expected_shape_cross
                for layer_idx in range(len(iter_attentions))
            ]
            self.assertListEqual([layer_attention.shape for layer_attention in iter_attentions], expected_shapes)

    @require_optimum_quanto
    @pytest.mark.generate
    @unittest.skip("Mllama is actually an encoder decoder cache and thus can't supports quant cache")
    def test_generate_with_quant_cache(self):
        pass

    @unittest.skip("For some unknown reasons the tests fails in CrossAttention layer when doing torch.sdpa(). ")
    @pytest.mark.torch_compile_test
    def test_sdpa_can_compile_dynamic(self):
        pass

    @unittest.skip(reason="AssertionError: Items in the second set but not the first: might be a setting issue")
    def test_model_parallelism(self):
        pass

    @unittest.skip(reason="Mllama can't assisted decoding due to cache format and `Cache.crop()`")
    def test_assisted_decoding_with_num_logits_to_keep(self):
        pass

    @unittest.skip(reason="Mllama uses self.weights directly causing device mismatch when offloading`")
    def test_cpu_offload(self):
        pass

    @unittest.skip(reason="Mllama uses self.weights directly causing device mismatch when offloading`")
    def test_disk_offload_bin(self):
        pass

    @unittest.skip(reason="Mllama uses self.weights directly causing device mismatch when offloading`")
    def test_disk_offload_safetensors(self):
        pass

    @unittest.skip("Mllama applies key/query norm which doesn't work with packing")
    def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
        pass

    @unittest.skip("Mllama applies key/query norm which doesn't work with packing")
    def test_flash_attention_2_padding_matches_padding_free_with_position_ids_and_fa_kwargs(self):
        pass

    @unittest.skip("Mllama applies key/query norm which doesn't work with packing")
    def test_eager_padding_matches_padding_free_with_position_ids(self):
        pass

    @unittest.skip("Mllama applies key/query norm which doesn't work with packing")
    def test_sdpa_padding_matches_padding_free_with_position_ids(self):
        pass

    @pytest.mark.generate
    # overridden because mllama is not an encoder-decoder model, but has encoder-decoder-like cache
    def test_past_key_values_format(self):
        # Test that the KV cache is formatted correctly. Exceptions need to explicitly overwrite this test. Having a
        # standard KV cache format is important for a consistent API (and for advanced generation methods).
        for model_class in self.all_generative_model_classes:
            config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
            model = model_class(config).to(torch_device)
            if "use_cache" not in inputs:
                inputs["use_cache"] = True
            outputs = model(**inputs)
            text_config = config.get_text_config()
            # Fall back through the various layer-count attribute names used across configs.
            num_hidden_layers = (
                getattr(text_config, "decoder_layers", None)
                or getattr(text_config, "num_decoder_layers", None)
                or text_config.num_hidden_layers
            )
            num_attention_heads = getattr(text_config, "decoder_attention_heads", text_config.num_attention_heads)
            embed_dim = getattr(text_config, "d_model", text_config.hidden_size)
            per_head_embed_dim = embed_dim // num_attention_heads
            # some models have different num-head for query vs key/value so we need to assign correct value
            # BUT only after `per_head_embed_dim` is set
            num_attention_heads = (
                text_config.num_key_value_heads
                if getattr(text_config, "num_key_value_heads", None) is not None
                else num_attention_heads
            )
            past_kv = outputs["past_key_values"]
            self.assertEqual(len(past_kv), num_hidden_layers)
            batch_size, seq_length = inputs["input_ids"].shape
            for i in range(num_hidden_layers):
                # NOTE(review): `past_kv[0]` checks the same first layer on every iteration —
                # likely intended to be `past_kv[i]`; confirm.
                self.assertEqual(len(past_kv[0]), 2)  # K V for the decoder = 2
                if i in self.model_tester.text_config["cross_attention_layers"]:
                    # Cross-attention layers cache keys/values over the image-token sequence.
                    self.assertEqual(
                        past_kv[i][0].shape,
                        (batch_size, num_attention_heads, self.model_tester.image_length, per_head_embed_dim),
                    )
                    self.assertEqual(
                        past_kv[i][1].shape,
                        (batch_size, num_attention_heads, self.model_tester.image_length, per_head_embed_dim),
                    )
                else:
                    self.assertEqual(
                        past_kv[i][0].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim)
                    )
                    self.assertEqual(
                        past_kv[i][1].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim)
                    )

    # overridden because mllama has special cache for self and cross attentions
    def _check_past_key_values_for_generate(self, batch_size, decoder_past_key_values, cache_length, config):
        self.assertIsInstance(decoder_past_key_values, Cache)
        self.assertListEqual(
            [isinstance(iter_past_key_values, tuple) for iter_past_key_values in decoder_past_key_values],
            [True] * len(decoder_past_key_values),
        )
        for layer_idx, layer_past_key_values in enumerate(decoder_past_key_values):
            if layer_idx in self.model_tester.text_config["cross_attention_layers"]:
                # Cross-attention cache length is the fixed image-token sequence length.
                expected_shape = (
                    batch_size,
                    config.num_key_value_heads
                    if hasattr(config, "num_key_value_heads")
                    else config.num_attention_heads,
                    self.model_tester.image_length,
                    config.hidden_size // config.num_attention_heads,
                )
            else:
                # (batch, head, cache_length, head_features)
                expected_shape = (
                    batch_size,
                    config.num_key_value_heads
                    if hasattr(config, "num_key_value_heads")
                    else config.num_attention_heads,
                    cache_length,
                    config.hidden_size // config.num_attention_heads,
                )
            # check shape key, value
            self.assertListEqual([layer_past_key_values[0].shape], [expected_shape])
            self.assertListEqual([layer_past_key_values[1].shape], [expected_shape])

    def test_generate_text_only_with_cache(self):
        """
        Tests that our cached generation with text-only inputs works. When mllama was introduced, this feature
        required cache modifications (because layers are skipped in practice). This test should prevent regressions.
        """
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_generative_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            inputs = self._prepare_for_class(inputs_dict, model_class)
            input_ids = inputs["input_ids"]
            # Drop all image inputs so generation runs in text-only mode.
            del inputs["input_ids"]
            del inputs["pixel_values"]
            model.generate(input_ids, use_cache=True)

    @pytest.mark.generate
    def test_left_padding_compatibility(self):
        # Overwrite -- mllama needs to prepare `cross_attention_mask`, and it must be padded accordingly
        _, inputs_dict = self.prepare_config_and_inputs_for_generate()
        input_ids = inputs_dict["input_ids"]
        cross_attention_mask = inputs_dict["cross_attention_mask"]
        # Prepend 32 all-zero (masked-out) positions to mirror the left padding of the text input.
        pad_cross_attn_size = (input_ids.shape[0], 32, *cross_attention_mask.shape[2:])
        extra_cross_attn_mask = torch.zeros(pad_cross_attn_size, dtype=cross_attention_mask.dtype, device=torch_device)
        padded_cross_attention_mask = torch.cat([extra_cross_attn_mask, cross_attention_mask], dim=1)
        # `cross_attention_mask` is randomly generated in `prepare_config_and_inputs_for_generate`, and it must match
        # its padded version for the test to be valid -- we need to pass both
        unpadded_custom_inputs = {"cross_attention_mask": cross_attention_mask}
        padded_custom_inputs = {"cross_attention_mask": padded_cross_attention_mask}
        super().test_left_padding_compatibility(
            unpadded_custom_inputs=unpadded_custom_inputs, padded_custom_inputs=padded_custom_inputs
        )
@require_torch
class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
    """Slow end-to-end tests for the real 11B Mllama checkpoints, loaded in 4-bit (bitsandbytes).

    Reference token ids and generations are device-specific, so each test resolves the
    expected value for the current accelerator via `Expectations` before comparing.
    """

    def setUp(self):
        # Base (pretrained) checkpoint vs. the chat-tuned "Instruct" variant.
        self.base_model_checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        self.instruct_model_checkpoint = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    def tearDown(self):
        # Reclaim accelerator memory between tests so the suite doesn't OOM.
        cleanup(torch_device, gc_collect=True)

    @slow
    @require_torch_accelerator
    @require_bitsandbytes
    @require_read_token
    def test_11b_model_integration_generate(self):
        """Greedy generation for a single image+text prompt matches the per-device reference."""
        # Prepare inputs
        processor = AutoProcessor.from_pretrained(self.base_model_checkpoint)
        prompt = "<|image|>If I had to write a haiku for this one"
        url = "https://llava-vl.github.io/static/images/view.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch_device)
        input_ids = inputs["input_ids"]
        # Check inputs ids
        expected_input_ids_all = Expectations(
            {
                ("xpu", 3): torch.tensor([[128000, 128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]], device=torch_device),
                ("cuda", 7): torch.tensor([[128000, 128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]], device=torch_device),
                ("cuda", 8): torch.tensor([[128000, 128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]], device=torch_device),
            }
        ) # fmt: skip
        expected_input_ids = expected_input_ids_all.get_expectation()
        self.assertTrue(torch.equal(input_ids, expected_input_ids))
        # Load model in 4 bit
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        model = MllamaForConditionalGeneration.from_pretrained(
            self.base_model_checkpoint, quantization_config=quantization_config
        )
        # Generate (do_sample=False -> deterministic greedy decoding)
        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        expected_outputs = Expectations(
            {
                ("xpu", 3): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
                ("cuda", 7): "If I had to write a haiku for this one, it would be:.\\nA dock in the lake.\\nA mountain in the distance.\\nA long exposure.",
                ("cuda", 8): 'If I had to write a haiku for this one, it would be:.\\nA dock in the lake.\\nA mountain in the distance.\\nA long exposure.',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
        self.assertEqual(
            decoded_output,
            expected_output,
            f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
        )

    @slow
    @require_torch_accelerator
    @require_bitsandbytes
    @require_read_token
    def test_11b_model_integration_generate_text_only(self):
        """Text-only (no pixel values) generation works and matches the per-device reference."""
        # Prepare inputs
        processor = AutoProcessor.from_pretrained(self.base_model_checkpoint)
        prompt = "If I had to write a haiku"
        inputs = processor(text=prompt, return_tensors="pt").to(torch_device)
        input_ids = inputs["input_ids"].cpu().squeeze().tolist()
        # Check inputs ids
        expected_input_ids_all = Expectations(
            {
                ("xpu", 3): [128000, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342],
                ("cuda", 7): [128000, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342],
                ("cuda", 8): [128000, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342],
            }
        )
        expected_input_ids = expected_input_ids_all.get_expectation()
        self.assertEqual(input_ids, expected_input_ids)
        # Load model in 4 bit
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        model = MllamaForConditionalGeneration.from_pretrained(
            self.base_model_checkpoint, quantization_config=quantization_config
        )
        # Generate
        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        expected_outputs = Expectations(
            {
                ("xpu", 3): "If I had to write a haiku about my life, I would write:\nLife is a messy tapestry\n Threads of joy and sorrow\nWeft of memories",
                ("cuda", 7): "If I had to write a haiku about my life, I would write:\nLife is a messy stream\nRipples of joy and pain\nFlowing, ever",
                ("cuda", 8): "If I had to write a haiku about my life, I would write:\nLife is a messy stream\nRipples of joy and pain\nFlowing, ever",
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
        self.assertEqual(
            decoded_output,
            expected_output,
            f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
        )

    @slow
    @require_torch_accelerator
    @require_bitsandbytes
    @require_read_token
    def test_11b_model_integration_forward(self):
        """A single forward pass produces next-token logits close to the per-device reference."""
        # Prepare inputs
        processor = AutoProcessor.from_pretrained(self.base_model_checkpoint)
        prompt = "<|image|>If I had to write a haiku for this one"
        url = "https://llava-vl.github.io/static/images/view.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch_device)
        # Load model in 4 bit
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        model = MllamaForConditionalGeneration.from_pretrained(
            self.base_model_checkpoint, quantization_config=quantization_config
        )
        # Forward
        with torch.inference_mode():
            output = model(**inputs)
        # Only the first 5 logits of the last position are compared (atol accounts for 4-bit noise).
        actual_logits = output.logits[0, -1, :5].cpu()
        expected_logits_all = Expectations(
            {
                ("xpu", 3): torch.tensor([9.1562, 8.9141, 5.0664, 1.6855, 3.2324], dtype=actual_logits.dtype),
                ("cuda", 7): torch.tensor([9.0781, 8.8750, 5.0781, 1.6221, 3.2207], dtype=actual_logits.dtype),
                ("cuda", 8): torch.tensor([9.0703, 8.8750, 5.0781, 1.6279, 3.2207], dtype=actual_logits.dtype),
            }
        )
        expected_logits = expected_logits_all.get_expectation()
        self.assertTrue(
            torch.allclose(actual_logits, expected_logits, atol=0.1),
            f"Actual logits: {actual_logits}"
            f"\nExpected logits: {expected_logits}"
            f"\nDifference: {torch.abs(actual_logits - expected_logits)}",
        )

    @slow
    @require_torch_accelerator
    @require_bitsandbytes
    @require_read_token
    def test_11b_model_integration_batched_generate(self):
        """Batched generation (two prompts, one image each) matches per-device references."""
        processor = AutoProcessor.from_pretrained(self.base_model_checkpoint)
        # Prepare inputs
        prompt = [
            "<|image|>If I had to write a haiku for this one",
            "<|image|>This image shows",
        ]
        image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
        image2 = Image.open(
            requests.get(
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg",
                stream=True,
            ).raw
        )
        # Images are nested: one inner list per batch sample.
        inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
            torch_device
        )
        # Load model in 4 bit
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        model = MllamaForConditionalGeneration.from_pretrained(
            self.base_model_checkpoint, quantization_config=quantization_config
        )
        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
        # Check first output
        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        expected_outputs = Expectations(
            {
                ("xpu", 3): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
                ("cuda", 7): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
                ("cuda", 8): 'If I had to write a haiku for this one, it would be:.\\nA dock in the lake.\\nA mountain in the distance.\\nA long exposure.',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
        self.assertEqual(
            decoded_output,
            expected_output,
            f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
        )
        # Check second output
        decoded_output = processor.decode(output[1], skip_special_tokens=True)
        expected_outputs = Expectations(
            {
                ("xpu", 3): "This image shows\nI'm not able to provide information on the person in this image. I can give you an idea of what's happening",
                ("cuda", 7): "This image shows\nI'm not able to provide information on the person in this image. I can give you an idea of what's happening",
                ("cuda", 8): "This image shows\nI'm not able to provide information on the person in this image. I can give you an idea of what's happening",
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
        self.assertEqual(
            decoded_output,
            expected_output,
            f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
        )

    @slow
    @require_torch_accelerator
    @require_bitsandbytes
    @require_read_token
    def test_11b_model_integration_multi_image_generate(self):
        """Instruct checkpoint: multi-turn chat with two images in one sample generates as expected."""
        processor = AutoProcessor.from_pretrained(self.instruct_model_checkpoint)
        # Prepare inputs
        image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
        image2 = Image.open(
            requests.get(
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg",
                stream=True,
            ).raw
        )
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "Whats shown in this image?"},
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": "This image shows a long wooden dock extending out into a lake."}
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What about this one, what do you see here? Can you describe in detail?"},
                ],
            },
        ]
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        # Both images belong to the single batch sample, hence the double nesting.
        inputs = processor(text=prompt, images=[[image1, image2]], return_tensors="pt").to(torch_device)
        prompt_len = inputs["input_ids"].shape[-1]
        # Load model in 4 bit
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        model = MllamaForConditionalGeneration.from_pretrained(
            self.instruct_model_checkpoint, quantization_config=quantization_config
        )
        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
        # Check first output; only the newly generated suffix is compared.
        generated_output = output[0][prompt_len:]
        decoded_output = processor.decode(generated_output, skip_special_tokens=False)
        # model should response about "stop sign", however it responses about "dock"
        # this happens only in quantized version, bfloat16 works fine
        expected_output = "This image shows a long wooden dock extending out into a lake. The dock is made of wooden planks and has a railing"
        self.assertEqual(
            decoded_output,
            expected_output,
            f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
        )

# --- extraction artifact: a second source file begins below -------------------------
# The original commit view rendered "View File" / "@@ -0,0 +1,427 @@" here; the content
# that follows is a separate test module (Mllama *processor* tests) with its own
# license header and imports.
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import shutil
import tempfile
import unittest
from typing import Optional
import numpy as np
from transformers import MllamaProcessor
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
if is_vision_available():
from PIL import Image
@require_torch
@require_vision
class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Processor tests for Mllama: chat templating, nested image batching, and
    text/image consistency checks (cross-attention masks, error cases)."""

    # Processor class under test; consumed by the ProcessorTesterMixin machinery.
    processor_class = MllamaProcessor
@classmethod
def setUpClass(cls):
    """Fetch the reference processor once and cache the tokens/images shared by all tests."""
    cls.checkpoint = "hf-internal-testing/mllama-11b"
    proc = MllamaProcessor.from_pretrained(cls.checkpoint)
    # Two differently-shaped dummy images exercise the resize/tiling paths.
    cls.image1 = Image.new("RGB", (224, 220))
    cls.image2 = Image.new("RGB", (512, 128))
    # Special tokens/ids that individual tests assert against.
    cls.image_token = proc.image_token
    cls.image_token_id = proc.image_token_id
    cls.pad_token_id = proc.tokenizer.pad_token_id
    cls.bos_token = proc.bos_token
    cls.bos_token_id = proc.tokenizer.bos_token_id
    # Save to a temp dir so tests can round-trip via `from_pretrained`.
    cls.tmpdirname = tempfile.mkdtemp()
    proc.save_pretrained(cls.tmpdirname)
@classmethod
def tearDownClass(cls):
    """Release the dummy image handles and delete the temporary processor directory."""
    for img in (cls.image1, cls.image2):
        img.close()
    shutil.rmtree(cls.tmpdirname, ignore_errors=True)
def prepare_processor_dict(self):
    """Return the extra kwargs (the Llama-3.2 chat template) used to instantiate the processor."""
    return {"chat_template": "{% for message in messages %}{% if loop.index0 == 0 %}{{ bos_token }}{% endif %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<|image|>' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{ '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"} # fmt: skip
# Overridden: Mllama expects an explicitly nested batch (one inner list per sample).
def prepare_image_inputs(self, batch_size: Optional[int] = None):
    """Prepare PIL test images, wrapping each one in its own single-element list."""
    base_images = super().prepare_image_inputs(batch_size)
    if not isinstance(base_images, (list, tuple)):
        return base_images
    return [[img] for img in base_images]
def test_chat_template_is_saved(self):
    """The chat template must round-trip through save/load via its dedicated file, not the json."""
    reloaded = self.processor_class.from_pretrained(self.tmpdirname)
    serialized = json.loads(reloaded.to_json_string())
    # Chat templates are never serialized into the processor json...
    self.assertFalse("chat_template" in serialized)
    # ...they live in a separate file, so the reloaded template must equal the reference one.
    reference_template = self.prepare_processor_dict().get("chat_template", None)
    self.assertTrue(reloaded.chat_template == reference_template)
def test_apply_chat_template(self):
    """Exercise chat templating: multi-image turns, system prompts (tokenized ids),
    images interleaved mid-message, and string-vs-list content equivalence."""
    # Message contains content which a mix of lists with images and image urls and string
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "image"},
                {"type": "text", "text": "What do these images show?"},
            ],
        },
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": "The first image shows the statue of Liberty in New York."},
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "And who is that?"},
            ],
        },
    ]
    processor = MllamaProcessor.from_pretrained(self.tmpdirname)
    rendered = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    # Each image placeholder becomes <|image|>; roles are wrapped in header tokens.
    expected_rendered = (
        "<|begin_of_text|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        "<|image|><|image|>What do these images show?"
        "<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        "The first image shows the statue of Liberty in New York."
        "<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        "And who is that?"
        "<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    self.assertEqual(rendered, expected_rendered)
    # Text-only system + user conversation, checked at the token-id level.
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "This is a test sentence."},
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "This is a response."},
            ],
        },
    ]
    input_ids = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
    expected_ids = [
        [
            128000,  # <|begin_of_text|>
            128006,  # <|start_header_id|>
            9125,  # "system"
            128007,  # <|end_of_header|>
            271,  # "\n\n"
            2028,
            374,
            264,
            1296,
            11914,
            13,  # "This is a test sentence."
            128009,  # <|eot_id|>
            128006,  # <|start_header_id|>
            882,  # "user"
            128007,  # <|end_of_header|>
            271,  # "\n\n"
            2028,
            374,
            264,
            2077,
            13,  # "This is a response.",
            128009,  # <|eot_id|>
            128006,  # <|start_header_id|>
            78191,  # "assistant"
            128007,  # <|end_of_header|>
            271,  # "\n\n"
        ]
    ]
    self.assertEqual(input_ids, expected_ids)
    # test image in multiple locations
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in two sentences"},
                {
                    "type": "image",
                    "url": url_to_local_path(
                        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
                    ),
                },
                {"type": "text", "text": " Test sentence "},
                {
                    "type": "image",
                    "url": url_to_local_path(
                        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
                    ),
                },
                {"type": "text", "text": "ok\n"},
            ],
        }
    ]
    rendered = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    # Placeholders must appear exactly where the image entries sit between the text chunks.
    expected_rendered = (
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        "Describe this image in two sentences<|image|> Test sentence <|image|>ok\n<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    self.assertEqual(rendered, expected_rendered)
    input_ids = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
    # fmt: off
    expected_ids = [[
        128000, 128006, 882, 128007, 271, 75885, 420, 2217, 304, 1403, 23719, 128256,
        3475, 11914, 262, 128256, 564, 198, 128009, 128006, 78191, 128007, 271,
    ]]
    # fmt: on
    self.assertEqual(input_ids, expected_ids)
    # text format for content: a plain string with inline <|image|> must render the same
    # as the structured list form.
    messages_list = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Describe this image in two sentences"},
            ],
        }
    ]
    messages_str = [
        {
            "role": "user",
            "content": "<|image|>Describe this image in two sentences",
        }
    ]
    rendered_list = processor.apply_chat_template(messages_list, add_generation_prompt=True, tokenize=False)
    rendered_str = processor.apply_chat_template(messages_str, add_generation_prompt=True, tokenize=False)
    self.assertEqual(rendered_list, rendered_str)
def test_process_interleaved_images_prompts_image_splitting(self):
    """Check pixel-value tiling shapes, token ids, padding, and the cross-attention mask
    layout (text tokens x images x tiles) for single and batched image+text inputs."""
    processor = MllamaProcessor.from_pretrained(self.tmpdirname)
    # Test that a single image is processed correctly
    inputs = processor(images=self.image2, size={"width": 224, "height": 224})
    # Shape: (batch, images-per-sample, tiles, channels, height, width).
    self.assertEqual(inputs["pixel_values"].shape, (1, 1, 4, 3, 224, 224))
    # Test that text is processed correctly
    text = "<|begin_of_text|>This is a test sentence.<|end_of_text|>"
    inputs = processor(text=text)
    expected_ids = [128000, 2028, 374, 264, 1296, 11914, 13, 128001]
    self.assertEqual(inputs["input_ids"][0], expected_ids)
    self.assertEqual(inputs["attention_mask"][0], [1] * len(expected_ids))
    # No images -> no cross-attention mask at all.
    self.assertEqual(inputs.get("cross_attention_mask"), None)
    # Test a single sample with image and text
    image_str = "<|image|>"
    text_str = "This is a test sentence."
    text = image_str + text_str
    inputs = processor(
        text=text,
        images=self.image1,
        size={"width": 128, "height": 128},
    )
    expected_ids = [self.image_token_id, self.bos_token_id] + [2028, 374, 264, 1296, 11914, 13]
    self.assertEqual(inputs["pixel_values"].shape, (1, 1, 4, 3, 128, 128))
    self.assertEqual(inputs["input_ids"][0], expected_ids)
    self.assertEqual(inputs["attention_mask"][0], [1] * len(expected_ids))
    cross_attention_mask = inputs["cross_attention_mask"]
    self.assertEqual(cross_attention_mask.shape, (1, 8, 1, 4))
    # Image token precedes all text, so every text token attends to every tile.
    self.assertTrue(
        np.all(cross_attention_mask == 1), f"Cross attention mask is not all ones: {cross_attention_mask}"
    )
    # Test batch
    text = [
        "<|image|>This is a test sentence.",
        "This is a test sentence.<|image|><|image|>This is a test sentence.",
    ]
    # fmt: off
    expected_ids = [
        [self.image_token_id, self.bos_token_id, 2028, 374, 264, 1296, 11914, 13],
        [self.bos_token_id, 2028, 374, 264, 1296, 11914, 13, self.image_token_id, self.image_token_id, 2028, 374, 264, 1296, 11914, 13],
    ]
    # fmt: on
    images = [[self.image1], [self.image1, self.image2]]
    inputs = processor(text=text, images=images, padding=True, size={"width": 256, "height": 256})
    self.assertEqual(inputs["pixel_values"].shape, (2, 2, 4, 3, 256, 256))
    # Split each row into padded and real tokens using the attention mask.
    for input_ids_i, attention_mask_i, expected_ids_i in zip(
        inputs["input_ids"], inputs["attention_mask"], expected_ids
    ):
        pad_ids = [id for id, m in zip(input_ids_i, attention_mask_i) if m == 0]
        input_ids = [id for id, m in zip(input_ids_i, attention_mask_i) if m == 1]
        self.assertEqual(input_ids, expected_ids_i)
        self.assertEqual(pad_ids, [self.pad_token_id] * len(pad_ids))
    cross_attention_mask = inputs["cross_attention_mask"]
    self.assertEqual(cross_attention_mask.shape, (2, 15, 2, 4))
    # Check that only first tile of first sample is attended to all text tokens
    first_sample_mask = cross_attention_mask[0].copy()
    first_image_first_tile_attention = first_sample_mask[:, :1, :1] # text tokens, images, tiles
    self.assertTrue(
        np.all(first_image_first_tile_attention == 1),
        f"Cross attention mask is not all ones: {first_image_first_tile_attention}",
    )
    # zero out first tile of first image
    first_image_first_tile_attention[:, :1, :1] = 0
    self.assertTrue(
        np.all(first_image_first_tile_attention == 0),
        f"Cross attention mask is not all zeros: {first_image_first_tile_attention}",
    )
    # second sample: first image token appears at text position 7, second at position 8.
    second_sample_mask = cross_attention_mask[1].copy()
    first_image_first_tile_attention = second_sample_mask[7:, :1, :1] # text tokens, images, tiles
    self.assertTrue(
        np.all(first_image_first_tile_attention == 1),
        f"Cross attention mask is not all ones: {first_image_first_tile_attention}",
    )
    second_image_two_tiles_attention = second_sample_mask[8:, 1:2, :2] # text tokens, images, tiles
    self.assertTrue(
        np.all(second_image_two_tiles_attention == 1),
        f"Cross attention mask is not all ones: {second_image_two_tiles_attention}",
    )
    # zero out both images masks; nothing else in the mask may be set.
    second_sample_mask[7:, :1, :1] = 0
    second_sample_mask[8:, 1:2, :2] = 0
    self.assertTrue(
        np.all(second_sample_mask == 0), f"Cross attention mask is not all zeros: {second_sample_mask}"
    )
def test_process_interleaved_images_prompts_image_error(self):
    """Mismatches between `<|image|>` placeholders and supplied images must raise ValueError.

    Fix: the final two `assertRaises` blocks previously called the processor with
    `images=None`, leaving the freshly built `images` lists dead — so the
    images-count-mismatch and un-nested-images cases were never actually exercised.
    They now pass `images=images` as intended.
    """
    text = [
        "This is a test sentence.",
        "In this other sentence we try some good things",
    ]
    processor = MllamaProcessor.from_pretrained(self.tmpdirname)
    # No image tokens and no images: valid.
    inputs = processor(text=text, images=None, padding=True)
    self.assertIsNotNone(inputs["input_ids"])
    text = [
        "This is a test sentence.<|image|>",
        "In this other sentence we try some good things",
    ]
    # An image token is present but no images were supplied.
    with self.assertRaises(ValueError):
        processor(text=text, images=None, padding=True)
    # One sample has an image token but an empty image list.
    images = [[self.image1], []]
    with self.assertRaises(ValueError):
        processor(text=text, images=images, padding=True)
    text = [
        "This is a test sentence.<|image|>",
        "In this other sentence we try some good things<|image|>",
    ]
    # Both samples need an image, none supplied.
    with self.assertRaises(ValueError):
        processor(text=text, images=None, padding=True)
    # Correctly nested: one image per sample, must succeed.
    images = [[self.image1], [self.image2]]
    inputs = processor(text=text, images=images, padding=True)
    # Two images for the first sample, none for the second: count mismatch must raise.
    images = [[self.image1, self.image2], []]
    with self.assertRaises(ValueError):
        processor(text=text, images=images, padding=True)
    # see https://github.com/huggingface/transformers/pull/35934
    # A flat (un-nested) image list alongside a text batch is ambiguous and must raise.
    images = [self.image1, self.image2]
    with self.assertRaises(ValueError):
        processor(text=text, images=images, padding=True)
def test_unstructured_kwargs_batched(self):
    """Overridden: Mllama needs explicitly nested image batches — with two flat images the
    correct nesting cannot be inferred, so we rely on the nested preparer here."""
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")
    components = self.prepare_components()
    extra_kwargs = self.prepare_processor_dict()
    processor = self.processor_class(**components, **extra_kwargs)
    self.skip_processor_without_typed_kwargs(processor)
    texts = self.prepare_text_inputs(batch_size=2, modalities="image")
    nested_images = self.prepare_image_inputs(batch_size=2)
    outputs = processor(
        text=texts,
        images=nested_images,
        return_tensors="pt",
        do_rescale=True,
        rescale_factor=-1,
        padding="longest",
        max_length=76,
    )
    # A negative rescale factor forces non-positive pixel values.
    self.assertLessEqual(outputs[self.images_input_name][0][0].mean(), 0)
    # Both rows are padded to the same (sub-76) length.
    first_row = outputs[self.text_input_name][0]
    second_row = outputs[self.text_input_name][1]
    self.assertTrue(len(first_row) == len(second_row) and len(second_row) < 76)
def test_special_mm_token_truncation(self):
    """Special vision tokens must never be truncated away when `truncation=True` is set."""
    processor = self.get_processor()
    texts = self.prepare_text_inputs(batch_size=2, modalities="image")
    nested_images = self.prepare_image_inputs(batch_size=2)
    # Without truncation the inputs process cleanly.
    _ = processor(
        text=texts,
        images=nested_images,
        return_tensors="pt",
        truncation=None,
        padding=True,
    )
    # Truncating so aggressively that image tokens would be cut must raise instead.
    with self.assertRaises(ValueError):
        _ = processor(
            text=texts,
            images=nested_images,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=3,
        )
# Fix: corrected "ttogether" typo in the skip reason.
@unittest.skip("Mllama can't process inputs with no image together with multimodal inputs")
def test_processor_text_has_no_visual(self):
    """Skipped: a text-only sample cannot be batched together with image-bearing samples."""
    pass