init
This commit is contained in:
0
transformers/tests/models/fuyu/__init__.py
Normal file
0
transformers/tests/models/fuyu/__init__.py
Normal file
63
transformers/tests/models/fuyu/test_image_processing_fuyu.py
Normal file
63
transformers/tests/models/fuyu/test_image_processing_fuyu.py
Normal file
@@ -0,0 +1,63 @@
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers import is_torch_available, is_vision_available
|
||||
from transformers.testing_utils import (
|
||||
require_torch,
|
||||
require_torchvision,
|
||||
require_vision,
|
||||
)
|
||||
|
||||
|
||||
if is_torch_available() and is_vision_available():
|
||||
import torch
|
||||
|
||||
from transformers import FuyuImageProcessor
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
|
||||
@require_torch
@require_vision
@require_torchvision
class TestFuyuImageProcessor(unittest.TestCase):
    """Unit tests for `FuyuImageProcessor`: patch counting, resizing and full preprocessing."""

    def setUp(self):
        """Create a processor with a 160x320 target size plus tensor, NumPy and PIL fixtures."""
        self.size = {"height": 160, "width": 320}
        self.processor = FuyuImageProcessor(size=self.size, padding_value=1.0)
        self.batch_size = 3
        self.channels = 3
        self.height = 300
        self.width = 300

        # Random batch laid out as (batch, channels, height, width), used by the patch test.
        self.image_input = torch.rand(self.batch_size, self.channels, self.height, self.width)

        self.image_patch_dim_h = 30
        self.image_patch_dim_w = 30
        # All-zero image of height 450 and width 210, in both NumPy and PIL form.
        self.sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
        self.sample_image_pil = Image.fromarray(self.sample_image)

    def test_patches(self):
        """`patchify_image` must yield as many patches as `get_num_patches` reports."""
        expected_num_patches = self.processor.get_num_patches(image_height=self.height, image_width=self.width)

        patches_final = self.processor.patchify_image(image=self.image_input)
        # `assertEqual` instead of a bare `assert`: it is not stripped under `python -O`
        # and matches the assertion style used by the other tests in this class.
        self.assertEqual(
            patches_final.shape[1],
            expected_num_patches,
            f"Expected {expected_num_patches} patches, got {patches_final.shape[1]}.",
        )

    def test_scale_to_target_aspect_ratio(self):
        """`resize` keeps the aspect ratio when fitting the sample image into the target size."""
        # (h:450, w:210) fitting (160, 320) -> (160, 210*160/450)
        scaled_image = self.processor.resize(self.sample_image, size=self.size)
        self.assertEqual(scaled_image.shape[0], 160)
        self.assertEqual(scaled_image.shape[1], 74)

    def test_apply_transformation_numpy(self):
        """`preprocess` on a NumPy image returns an image with the configured height and width."""
        transformed_image = self.processor.preprocess(self.sample_image).images[0][0]
        self.assertEqual(transformed_image.shape[1], 160)
        self.assertEqual(transformed_image.shape[2], 320)

    def test_apply_transformation_pil(self):
        """`preprocess` on a PIL image returns an image with the configured height and width."""
        transformed_image = self.processor.preprocess(self.sample_image_pil).images[0][0]
        self.assertEqual(transformed_image.shape[1], 160)
        self.assertEqual(transformed_image.shape[2], 320)
|
||||
358
transformers/tests/models/fuyu/test_modeling_fuyu.py
Normal file
358
transformers/tests/models/fuyu/test_modeling_fuyu.py
Normal file
@@ -0,0 +1,358 @@
|
||||
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch Fuyu model."""
|
||||
|
||||
import copy
|
||||
import io
|
||||
import unittest
|
||||
from functools import cached_property
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
import torch
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import FuyuConfig, is_torch_available, is_vision_available
|
||||
from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device
|
||||
|
||||
from ...generation.test_utils import GenerationTesterMixin
|
||||
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
|
||||
from ...test_pipeline_mixin import PipelineTesterMixin
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
|
||||
if is_torch_available() and is_vision_available():
|
||||
from transformers import FuyuProcessor
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
from transformers import FuyuForCausalLM, FuyuModel
|
||||
|
||||
|
||||
class FuyuModelTester:
    """Holds tiny hyper-parameters and builds a `FuyuConfig` plus random inputs for the model tests."""

    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        num_image_tokens=2,
        image_size=30,
        patch_size=15,
        num_channels=3,
        is_training=True,
        use_input_mask=True,
        use_labels=True,
        vocab_size=99,
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=16,
        type_sequence_label_size=2,
        initializer_range=0.02,
        num_labels=3,
        num_choices=4,
        pad_token_id=10,
        image_token_id=1,
        scope=None,
    ):
        """Store every hyper-parameter on the instance; `parent` is the owning test case."""
        self.parent = parent
        self.scope = scope

        # Input geometry: the stored sequence length already includes the image placeholder tokens.
        self.batch_size = batch_size
        self.num_image_tokens = num_image_tokens
        self.seq_length = seq_length + num_image_tokens
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels

        # Switches controlling which optional inputs/labels get generated.
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_labels = use_labels

        # Model hyper-parameters.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range

        # Label-space sizes.
        self.type_sequence_label_size = type_sequence_label_size
        self.num_labels = num_labels
        self.num_choices = num_choices

        # Special token ids.
        self.pad_token_id = pad_token_id
        self.image_token_id = image_token_id

    def prepare_config_and_inputs(self):
        """Return `(config, input_ids, input_mask, sequence_labels, token_labels)`.

        The mask is `None` when `use_input_mask` is false, and both label tensors are `None`
        when `use_labels` is false.
        """
        config = self.get_config()
        rows, cols = self.batch_size, self.seq_length

        input_ids = ids_tensor([rows, cols], self.vocab_size)
        # Random ids that happen to equal the image token become padding, so the only
        # image tokens are the ones written into the leading positions on the next line.
        accidental_image_tokens = input_ids == config.image_token_id
        input_ids[accidental_image_tokens] = self.pad_token_id
        input_ids[:, : self.num_image_tokens] = config.image_token_id

        input_mask = random_attention_mask([rows, cols]) if self.use_input_mask else None

        if self.use_labels:
            sequence_labels = ids_tensor([rows], self.type_sequence_label_size)
            token_labels = ids_tensor([rows, cols], self.num_labels)
        else:
            sequence_labels = token_labels = None

        return config, input_ids, input_mask, sequence_labels, token_labels

    def get_config(self):
        """Build the `FuyuConfig` from the stored hyper-parameters."""
        # NOTE(review): `patch_size`, `num_channels` and `image_size` stored on the tester are
        # not forwarded here, so the config keeps its own values for them.
        config_kwargs = {
            "vocab_size": self.vocab_size,
            "hidden_size": self.hidden_size,
            "num_hidden_layers": self.num_hidden_layers,
            "num_attention_heads": self.num_attention_heads,
            "intermediate_size": self.intermediate_size,
            "hidden_act": self.hidden_act,
            "hidden_dropout_prob": self.hidden_dropout_prob,
            "attention_probs_dropout_prob": self.attention_probs_dropout_prob,
            "max_position_embeddings": self.max_position_embeddings,
            "type_vocab_size": self.type_vocab_size,
            "is_decoder": False,
            "initializer_range": self.initializer_range,
            "pad_token_id": self.pad_token_id,
            "image_token_id": self.image_token_id,
        }
        return FuyuConfig(**config_kwargs)

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` with `input_ids`, `attention_mask` and `image_patches`."""
        config, input_ids, input_mask, _sequence_labels, _token_labels = self.prepare_config_and_inputs()

        # Flattened patch width is derived from the config, not from this tester's attributes.
        patch_feature_dim = config.num_channels * config.patch_size**2
        image_patches = floats_tensor([self.batch_size, self.num_image_tokens, patch_feature_dim])

        inputs_dict = {
            "input_ids": input_ids,
            "attention_mask": input_mask,
            "image_patches": image_patches,
        }
        return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
class FuyuModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """Runs the shared model, generation and pipeline test mixins against tiny Fuyu models."""

    all_model_classes = (
        (
            FuyuModel,
            FuyuForCausalLM,
        )
        if is_torch_available()
        else ()
    )
    pipeline_model_mapping = (
        {"text-generation": FuyuForCausalLM, "image-text-to-text": FuyuForCausalLM} if is_torch_available() else {}
    )

    # Feature switches for the shared mixin tests, all turned off for this model.
    test_head_masking = False
    test_pruning = False
    test_cpu_offload = False
    test_disk_offload = False
    test_model_parallel = False

    def setUp(self):
        """Attach a fresh `FuyuModelTester` to the test case."""
        self.model_tester = FuyuModelTester(self)

    def test_mismatching_image_patches(self):
        """A forward pass must raise `ValueError` when image patches and text inputs disagree."""
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            model = model_class(config).to(torch_device)
            # Work on a deep copy so the shared inputs are not affected by anything done below.
            curr_input_dict = copy.deepcopy(input_dict)

            # Unmodified inputs: the forward pass is expected to succeed.
            _ = model(**curr_input_dict)

            # Drop the first entry along dim 0 of `image_patches` while `input_ids` keeps
            # the image placeholder tokens of every row.
            input_ids = curr_input_dict["input_ids"]
            image_patches = curr_input_dict["image_patches"][1:, ...]
            with self.assertRaises(ValueError):
                _ = model(input_ids=input_ids, image_patches=image_patches)

            # Drop the first two rows of `input_ids` while keeping every image patch.
            input_ids = curr_input_dict["input_ids"][2:]
            image_patches = curr_input_dict["image_patches"]
            with self.assertRaises(ValueError):
                _ = model(input_ids=input_ids, image_patches=image_patches)

    @unittest.skip(
        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
    )
    def test_training_gradient_checkpointing(self):
        pass

    @unittest.skip(
        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
    )
    def test_training_gradient_checkpointing_use_reentrant(self):
        pass

    @unittest.skip(
        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
    )
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

    # `parameterized.expand` passes one positional argument per case, so the override accepts
    # it explicitly even though the test is skipped.
    @parameterized.expand([("random",), ("same",)])
    @pytest.mark.generate
    @unittest.skip("Fuyu doesn't support assisted generation due to the need to crop/extend image patches indices")
    def test_assisted_decoding_matches_greedy_search(self, assistant_type=None):
        pass

    @pytest.mark.generate
    @unittest.skip("Fuyu doesn't support assisted generation due to the need to crop/extend image patches indices")
    def test_assisted_decoding_sample(self):
        pass

    # TODO: Fix me (once this model gets more usage)
    @unittest.skip(reason="Does not work on the tiny model.")
    def test_disk_offload_bin(self):
        # Delegate to the same-named parent test so that removing the skip runs the intended check.
        super().test_disk_offload_bin()

    # TODO: Fix me (once this model gets more usage)
    @unittest.skip(reason="Does not work on the tiny model.")
    def test_disk_offload_safetensors(self):
        # Delegate to the same-named parent test so that removing the skip runs the intended check.
        super().test_disk_offload_safetensors()

    # TODO: Fix me (once this model gets more usage)
    @unittest.skip(reason="Does not work on the tiny model.")
    def test_model_parallelism(self):
        super().test_model_parallelism()

    @unittest.skip(reason="Fuyu `prepare_inputs_for_generation` function doesn't have cache position.")
    def test_generate_continue_from_inputs_embeds(self):
        pass

    @unittest.skip("Persimmon backbone applies key/query norm which doesn't work with packing")
    def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
        pass

    @unittest.skip("Persimmon backbone applies key/query norm which doesn't work with packing")
    def test_flash_attention_2_padding_matches_padding_free_with_position_ids_and_fa_kwargs(self):
        pass

    @unittest.skip("Persimmon backbone applies key/query norm which doesn't work with packing")
    def test_eager_padding_matches_padding_free_with_position_ids(self):
        pass

    @unittest.skip("Persimmon backbone applies key/query norm which doesn't work with packing")
    def test_sdpa_padding_matches_padding_free_with_position_ids(self):
        pass
|
||||
|
||||
|
||||
@slow
@require_torch_accelerator
class FuyuModelIntegrationTest(unittest.TestCase):
    """Slow end-to-end generation check against the `adept/fuyu-8b` checkpoint."""

    @cached_property
    def default_processor(self):
        """Processor loaded from the `adept/fuyu-8b` checkpoint (cached per test instance)."""
        return FuyuProcessor.from_pretrained("adept/fuyu-8b")

    @cached_property
    def default_model(self):
        """Float16 model loaded from `adept/fuyu-8b` onto `torch_device` (cached per test instance)."""
        return FuyuForCausalLM.from_pretrained("adept/fuyu-8b", dtype="float16", device_map=torch_device)

    def test_greedy_generation(self):
        """Generating with `max_new_tokens=10` on the bus image must yield the reference caption."""
        processor = self.default_processor
        model = self.default_model

        url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        # Bound the download time and fail on HTTP errors, so a network problem surfaces as a
        # clear request error instead of a hang or an undecodable image.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content))

        text_prompt_coco_captioning = "Generate a coco-style caption.\n"

        inputs = processor(images=image, text=text_prompt_coco_captioning, return_tensors="pt").to(
            torch_device, torch.float16
        )
        generated_ids = model.generate(**inputs, max_new_tokens=10)

        # take the last 8 tokens (in order to skip special \n\x04 characters) and decode them
        generated_text = processor.batch_decode(generated_ids[:, -8:], skip_special_tokens=True)[0]
        self.assertEqual(generated_text, "A blue bus parked on the side of a road.")
|
||||
|
||||
|
||||
"""
|
||||
@slow
|
||||
@require_torch_accelerator
|
||||
def test_model_8b_chat_greedy_generation_bus_color(self):
|
||||
EXPECTED_TEXT_COMPLETION = "The bus is blue.\n|ENDOFTEXT|"
|
||||
text_prompt_bus_color = "What color is the bus?\n"
|
||||
model_inputs_bus_color = self.processor(text=text_prompt_bus_color, images=self.bus_image_pil)
|
||||
|
||||
generated_tokens = self.model.generate(**model_inputs_bus_color, max_new_tokens=10)
|
||||
text = self.processor.tokenizer.batch_decode(generated_tokens)
|
||||
end_sequence = text[0].split("\x04")[1]
|
||||
clean_sequence = (
|
||||
end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")]
|
||||
if "|ENDOFTEXT|" in end_sequence
|
||||
else end_sequence
|
||||
)
|
||||
self.assertEqual(EXPECTED_TEXT_COMPLETION, clean_sequence)
|
||||
|
||||
@slow
|
||||
@require_torch_accelerator
|
||||
def test_model_8b_chat_greedy_generation_chart_vqa(self):
|
||||
EXPECTED_TEXT_TOKENS = ["The","life expectancy","at","birth","of male","s in","","20","18","is","","80",".","7",".","\n","|ENDOFTEXT|",] # fmt: skip
|
||||
expected_text_completion = " ".join(EXPECTED_TEXT_TOKENS) # TODO make sure the end string matches
|
||||
|
||||
text_prompt_chart_vqa = "What is the highest life expectancy at birth of male?\n"
|
||||
|
||||
chart_image_url = (
|
||||
"https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/chart.png"
|
||||
)
|
||||
chart_image_pil = Image.open(io.BytesIO(requests.get(chart_image_url).content))
|
||||
|
||||
model_inputs_chart_vqa = self.processor(text=text_prompt_chart_vqa, images=chart_image_pil)
|
||||
generated_tokens = self.model.generate(**model_inputs_chart_vqa, max_new_tokens=10)
|
||||
text = self.processor.tokenizer.batch_decode(generated_tokens)
|
||||
end_sequence = text[0].split("\x04")[1]
|
||||
clean_sequence = (
|
||||
end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")]
|
||||
if "|ENDOFTEXT|" in end_sequence
|
||||
else end_sequence
|
||||
)
|
||||
self.assertEqual(expected_text_completion, clean_sequence)
|
||||
|
||||
@slow
|
||||
@require_torch_accelerator
|
||||
def test_model_8b_chat_greedy_generation_bounding_box(self):
|
||||
EXPECTED_TEXT_COMPLETION = "\x00194213202244\x01|ENDOFTEXT|"
|
||||
text_prompt_bbox = "When presented with a box, perform OCR to extract text contained within it. If provided with text, generate the corresponding bounding box.\\nWilliams" # noqa: E231
|
||||
|
||||
bbox_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bbox_sample_image.png"
|
||||
bbox_image_pil = Image.open(io.BytesIO(requests.get(bbox_image_url).content))
|
||||
|
||||
model_inputs_bbox = self.processor(text=text_prompt_bbox, images=bbox_image_pil)
|
||||
generated_tokens = self.model.generate(**model_inputs_bbox, max_new_tokens=10)
|
||||
text = self.processor.tokenizer.batch_decode(generated_tokens)
|
||||
end_sequence = text[0].split("\x04")[1]
|
||||
clean_sequence = (
|
||||
end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")]
|
||||
if "|ENDOFTEXT|" in end_sequence
|
||||
else end_sequence
|
||||
)
|
||||
self.assertEqual(EXPECTED_TEXT_COMPLETION, clean_sequence)
|
||||
"""
|
||||
439
transformers/tests/models/fuyu/test_processing_fuyu.py
Normal file
439
transformers/tests/models/fuyu/test_processing_fuyu.py
Normal file
@@ -0,0 +1,439 @@
|
||||
import tempfile
|
||||
import unittest
|
||||
from shutil import rmtree
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
AutoTokenizer,
|
||||
FuyuImageProcessor,
|
||||
FuyuProcessor,
|
||||
is_torch_available,
|
||||
)
|
||||
from transformers.image_utils import load_image
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers.models.fuyu.processing_fuyu import construct_full_unpacked_stream, full_unpacked_stream_to_tensor
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_class = FuyuProcessor
|
||||
|
||||
@classmethod
def setUpClass(cls):
    """Save a `FuyuProcessor` into a temporary directory and load the shared text/image fixtures.

    Sets `cls.tmpdirname`, `cls.text_prompt` and `cls.bus_image_pil`.
    """
    cls.tmpdirname = tempfile.mkdtemp()
    try:
        image_processor = FuyuImageProcessor()
        tokenizer = AutoTokenizer.from_pretrained("adept/fuyu-8b")

        processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
        processor.save_pretrained(cls.tmpdirname)

        cls.text_prompt = "Generate a coco-style caption.\\n"
        bus_image_url = url_to_local_path(
            "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        )
        cls.bus_image_pil = load_image(bus_image_url)
    except BaseException:
        # unittest does not run tearDownClass when setUpClass raises, so remove the
        # directory here before propagating the error.
        rmtree(cls.tmpdirname, ignore_errors=True)
        raise
|
||||
|
||||
@classmethod
def tearDownClass(cls):
    """Delete the directory whose path is stored in `cls.tmpdirname`."""
    scratch_dir = cls.tmpdirname
    rmtree(scratch_dir)
|
||||
|
||||
def get_processor(self):
    """Return a new `FuyuProcessor` built from a default image processor and the `adept/fuyu-8b` tokenizer.

    Extra keyword arguments come from `self.prepare_processor_dict()`.
    """
    components = (FuyuImageProcessor(), AutoTokenizer.from_pretrained("adept/fuyu-8b"))
    return FuyuProcessor(*components, **self.prepare_processor_dict())
|
||||
|
||||
def get_tokenizer(self, **kwargs):
    """Reload the processor saved in `self.tmpdirname` (forwarding `kwargs`) and return its tokenizer."""
    reloaded = AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
    return reloaded.tokenizer
|
||||
|
||||
def get_image_processor(self, **kwargs):
    """Reload the processor saved in `self.tmpdirname` (forwarding `kwargs`) and return its image processor."""
    reloaded = AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
    return reloaded.image_processor
|
||||
|
||||
# NOTE(review): the "Copied from" marker below suggests this body mirrors the Llava processor
# test; confirm whether a repository consistency check requires the two to stay identical
# before changing any code here.
# Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens
def test_get_num_vision_tokens(self):
    "Tests general functionality of the helper used internally in vLLM"

    processor = self.get_processor()

    output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
    self.assertTrue("num_image_tokens" in output)
    self.assertEqual(len(output["num_image_tokens"]), 3)

    self.assertTrue("num_image_patches" in output)
    self.assertEqual(len(output["num_image_patches"]), 3)
|
||||
|
||||
def test_fuyu_processing(self):
|
||||
"""
|
||||
Test to ensure that the standard processing on a gold example matches adept's code.
|
||||
"""
|
||||
# fmt: off
|
||||
EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64)
|
||||
EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 
71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64)
|
||||
|
||||
one_image_bus_model_inputs = self.get_processor()(text=self.text_prompt, images=self.bus_image_pil)
|
||||
|
||||
# fmt: on
|
||||
torch.testing.assert_close(one_image_bus_model_inputs["image_patches_indices"], EXPECTED_IMAGE_PATCH_INPUTS)
|
||||
torch.testing.assert_close(one_image_bus_model_inputs["input_ids"], EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS)
|
||||
|
||||
def test_fuyu_processing_no_image(self):
    """
    Text-only call: the processor's `input_ids` must equal the ones produced by the tokenizer alone.
    """
    prompt = self.text_prompt
    via_processor = self.get_processor()(text=prompt)
    via_tokenizer = self.get_tokenizer()(prompt)
    self.assertEqual(via_processor["input_ids"], via_tokenizer["input_ids"])
|
||||
|
||||
def test_fuyu_processing_no_text(self):
|
||||
"""
|
||||
Test to check processor works with just image input
|
||||
"""
|
||||
# fmt: off
|
||||
EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([
|
||||
[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
|
||||
14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26,
|
||||
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
|
||||
41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
|
||||
54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66,
|
||||
67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
|
||||
81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93,
|
||||
94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
|
||||
108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
|
||||
121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133,
|
||||
134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
|
||||
148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160,
|
||||
161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174,
|
||||
175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
|
||||
188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200,
|
||||
201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
|
||||
215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227,
|
||||
228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
|
||||
-1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
|
||||
255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267,
|
||||
268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
|
||||
282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294,
|
||||
295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
|
||||
]).to(torch.int64)
|
||||
# fmt: on
|
||||
|
||||
processor_outputs = self.get_processor()(images=self.bus_image_pil)
|
||||
self.assertTrue((processor_outputs["image_patches_indices"] == EXPECTED_IMAGE_PATCH_INPUTS).all())
|
||||
|
||||
def test_fuyu_processing_multiple_image_sample(self):
    """
    Test to check processor works with multiple image inputs for a single text input
    """
    # Expected outputs for a single (`self.bus_image_pil`, `self.text_prompt`) sample, and for the
    # same prompt with the image resized via `.resize((64, 300))`. The literals are laid out one
    # grid row per line: in the patch-index tensors every run of consecutive indices is followed
    # by -1, and in the token tensors every run of 71011 is followed by 71019.
    # NOTE(review): -1 presumably marks positions that hold no image patch, 71011 / 71019 are
    # presumably the image placeholder / image newline token ids - confirm against the processor.
    # fmt: off
    SINGLE_IMAGE_PATCH_INPUTS = torch.Tensor([[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1,
        22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1,
        44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1,
        66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1,
        88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1,
        110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1,
        132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1,
        154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1,
        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1,
        198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1,
        220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1,
        242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1,
        264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1,
        286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64)
    SINGLE_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011,
        71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019,
        1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64)

    # NOTE(review): unlike the two tensors above, the resized expectations are not cast with
    # `.to(torch.int64)`, so they keep the dtype `torch.Tensor(...)` produces; the `==` checks
    # below presumably rely on dtype promotion - confirm this is intended.
    SINGLE_RESIZED_IMAGE_PATCH_INPUTS = torch.Tensor([[ 0, 1, 2, -1,
        3, 4, 5, -1,
        6, 7, 8, -1,
        9, 10, 11, -1,
        12, 13, 14, -1,
        15, 16, 17, -1,
        18, 19, 20, -1,
        21, 22, 23, -1,
        24, 25, 26, -1,
        27, 28, 29, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]])
    SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[ 71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71019,
        71011, 71011, 71011, 71019,
        1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122]])
    # fmt: on

    # Batch of two images - equally sized
    # Both rows are expected to match the single-sample expectation, stacked along dim 0.
    images = [self.bus_image_pil, self.bus_image_pil]
    processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images)

    self.assertTrue(
        (
            processor_outputs["image_patches_indices"]
            == torch.cat([SINGLE_IMAGE_PATCH_INPUTS, SINGLE_IMAGE_PATCH_INPUTS], dim=0)
        ).all()
    )
    self.assertTrue(
        (
            processor_outputs["input_ids"]
            == torch.cat([SINGLE_PADDED_UNPACKED_TOKEN_INPUTS, SINGLE_PADDED_UNPACKED_TOKEN_INPUTS], dim=0)
        ).all()
    )

    # Processes single images with different sizes as expected
    images = [self.bus_image_pil]
    processor_outputs = self.get_processor()(text=self.text_prompt, images=images)
    self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_IMAGE_PATCH_INPUTS).all())
    self.assertTrue((processor_outputs["input_ids"] == SINGLE_PADDED_UNPACKED_TOKEN_INPUTS).all())

    images = [self.bus_image_pil.resize((64, 300))]
    processor_outputs = self.get_processor()(text=self.text_prompt, images=images)
    self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_RESIZED_IMAGE_PATCH_INPUTS).all())
    self.assertTrue((processor_outputs["input_ids"] == SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS).all())

    # Batch of two images - different sizes. Left-pads the smaller image inputs
    images = [self.bus_image_pil, self.bus_image_pil.resize((64, 300))]
    processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images)

    # Build the expected second row by prepending padding to the resized expectation so it has
    # the same length as the full-size one: -1 is the pad value for patch indices ...
    padding_len_patch = SINGLE_IMAGE_PATCH_INPUTS.shape[1] - SINGLE_RESIZED_IMAGE_PATCH_INPUTS.shape[1]
    padded_single_resized_image_patch = torch.cat(
        [torch.ones([1, padding_len_patch]) * -1, SINGLE_RESIZED_IMAGE_PATCH_INPUTS], dim=1
    )
    expected_image_patch_inputs = torch.cat([SINGLE_IMAGE_PATCH_INPUTS, padded_single_resized_image_patch], dim=0)

    # ... and 0 is the pad value for token ids.
    padding_len_token = (
        SINGLE_PADDED_UNPACKED_TOKEN_INPUTS.shape[1] - SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS.shape[1]
    )
    padded_single_resized_padded_unpacked_token_inputs = torch.cat(
        [torch.zeros([1, padding_len_token]), SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS], dim=1
    )
    expected_padded_unpacked_token_inputs = torch.cat(
        [SINGLE_PADDED_UNPACKED_TOKEN_INPUTS, padded_single_resized_padded_unpacked_token_inputs], dim=0
    )

    self.assertTrue((processor_outputs["image_patches_indices"] == expected_image_patch_inputs).all())
    self.assertTrue((processor_outputs["input_ids"] == expected_padded_unpacked_token_inputs).all())
|
||||
|
||||
# Rewrite as Fuyu supports tokenizer kwargs only when image is None.
|
||||
@require_vision
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs(self):
    """Check that a call-time ``max_length`` is used instead of the tokenizer component's own.

    The tokenizer component is requested with ``max_length=117``; the processor is then called
    with ``max_length=112`` and ``padding="max_length"``, and the first ``input_ids`` row is
    expected to contain 112 entries.
    """
    has_image_processor = "image_processor" in self.processor_class.attributes
    if not has_image_processor:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")

    components = {
        "image_processor": self.get_component("image_processor"),
        "tokenizer": self.get_component("tokenizer", max_length=117),
    }
    processor = self.processor_class(**components)
    self.skip_processor_without_typed_kwargs(processor)

    prompt = self.prepare_text_inputs()
    # Fuyu uses tokenizer kwargs only when image is None.
    encoded = processor(
        text=prompt,
        images=None,
        return_tensors="pt",
        max_length=112,
        padding="max_length",
    )

    first_row = encoded["input_ids"][0]
    self.assertEqual(len(first_row), 112)
|
||||
|
||||
@unittest.skip("Fuyu processor does not support image_processor kwargs")
|
||||
def test_image_processor_defaults_preserved_by_image_kwargs(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Fuyu processor does not support image_processor kwargs")
|
||||
def test_kwargs_overrides_default_image_processor_kwargs(self):
|
||||
pass
|
||||
|
||||
# Rewrite as Fuyu supports tokenizer kwargs only when image is None.
|
||||
@require_vision
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs(self):
    """Check that the tokenizer component's defaults apply when the call passes no overrides.

    The tokenizer component is requested with ``max_length=117`` and ``padding="max_length"``;
    the processor is called with only ``return_tensors="pt"`` and the first ``input_ids`` row is
    expected to contain 117 entries.
    """
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")

    image_processor = self.get_component("image_processor")
    tokenizer = self.get_component(
        "tokenizer",
        max_length=117,
        padding="max_length",
    )
    processor = self.processor_class(
        tokenizer=tokenizer,
        image_processor=image_processor,
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompt = self.prepare_text_inputs()
    # Fuyu uses tokenizer kwargs only when image is None.
    encoded = processor(text=prompt, images=None, return_tensors="pt")

    token_row = encoded["input_ids"][0]
    self.assertEqual(len(token_row), 117)
|
||||
|
||||
# Rewrite as Fuyu image processor does not return pixel values
|
||||
@require_torch
@require_vision
def test_structured_kwargs_nested(self):
    """Check that kwargs grouped per modality are forwarded to the tokenizer.

    The processor is called with text only (``images=None``) and a nested kwargs dict that
    carries ``return_tensors="pt"`` under ``common_kwargs`` and ``padding="max_length"``,
    ``max_length=76`` under ``text_kwargs``. The first ``input_ids`` row is expected to contain
    76 entries.
    """
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")
    image_processor = self.get_component("image_processor")
    tokenizer = self.get_component("tokenizer")

    processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
    # Checked once, before the processor is used; repeating it after the call adds nothing
    # because the same check has already passed by then.
    self.skip_processor_without_typed_kwargs(processor)

    input_str = self.prepare_text_inputs()
    # Fuyu uses tokenizer kwargs only when image is None.
    image_input = None

    # Define the kwargs for each modality
    all_kwargs = {
        "common_kwargs": {"return_tensors": "pt"},
        "text_kwargs": {"padding": "max_length", "max_length": 76},
    }

    inputs = processor(text=input_str, images=image_input, **all_kwargs)

    self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||
|
||||
# Rewrite as Fuyu image processor does not return pixel values
|
||||
@require_torch
@require_vision
def test_structured_kwargs_nested_from_dict(self):
    """Check that a dict of per-modality kwargs, unpacked into the call, reaches the tokenizer.

    Text-only call (``images=None``) with ``common_kwargs`` / ``text_kwargs`` groups; the first
    ``input_ids`` row is expected to contain 76 entries.
    """
    has_image_processor = "image_processor" in self.processor_class.attributes
    if not has_image_processor:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")

    image_processor = self.get_component("image_processor")
    tokenizer = self.get_component("tokenizer")
    processor = self.processor_class(
        tokenizer=tokenizer,
        image_processor=image_processor,
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompt = self.prepare_text_inputs()

    # Define the kwargs for each modality
    text_kwargs = {"padding": "max_length", "max_length": 76}
    common_kwargs = {"return_tensors": "pt"}
    grouped_kwargs = {"common_kwargs": common_kwargs, "text_kwargs": text_kwargs}

    # Fuyu uses tokenizer kwargs only when image is None.
    encoded = processor(text=prompt, images=None, **grouped_kwargs)

    first_row = encoded["input_ids"][0]
    self.assertEqual(len(first_row), 76)
|
||||
|
||||
# Rewrite as Fuyu supports tokenizer kwargs only when image is None.
|
||||
@require_torch
@require_vision
def test_unstructured_kwargs(self):
    """Check that flat (ungrouped) tokenizer kwargs are honoured on a text-only call.

    The processor is called with ``images=None``, ``padding="max_length"`` and
    ``max_length=76``; the first ``input_ids`` row is expected to contain 76 entries.
    """
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")

    components = {
        "image_processor": self.get_component("image_processor"),
        "tokenizer": self.get_component("tokenizer"),
    }
    processor = self.processor_class(**components)
    self.skip_processor_without_typed_kwargs(processor)

    prompt = self.prepare_text_inputs()
    # Fuyu uses tokenizer kwargs only when image is None.
    call_kwargs = {
        "text": prompt,
        "images": None,
        "return_tensors": "pt",
        "padding": "max_length",
        "max_length": 76,
    }
    encoded = processor(**call_kwargs)

    self.assertEqual(len(encoded["input_ids"][0]), 76)
|
||||
|
||||
# Rewrite as Fuyu supports tokenizer kwargs only when image is None.
|
||||
@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
    """Check flat tokenizer kwargs on a text-only batch of two prompts.

    The processor is called with ``images=None``, ``padding="longest"`` and ``max_length=76``;
    the first ``input_ids`` row is expected to contain 7 entries.
    """
    has_image_processor = "image_processor" in self.processor_class.attributes
    if not has_image_processor:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")

    image_processor = self.get_component("image_processor")
    tokenizer = self.get_component("tokenizer")
    processor = self.processor_class(
        tokenizer=tokenizer,
        image_processor=image_processor,
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompts = self.prepare_text_inputs(batch_size=2)
    # Fuyu uses tokenizer kwargs only when image is None.
    call_kwargs = {
        "text": prompts,
        "images": None,
        "return_tensors": "pt",
        "padding": "longest",
        "max_length": 76,
    }
    encoded = processor(**call_kwargs)

    first_row = encoded["input_ids"][0]
    self.assertEqual(len(first_row), 7)
|
||||
|
||||
def test_processor_text_has_no_visual(self):
    """Check the processor accepts nested image lists, including a sample with no image.

    Two calls are made with a batch of three prompts: first with every sample wrapped in its
    own image list, then with the first sample's image list emptied and its prompt replaced by
    plain text. Each call only has to return the text input key.
    """
    # Overwritten: Fuyu has a complicated processing so we don't check id values
    processor = self.get_processor()

    text = self.prepare_text_inputs(batch_size=3, modalities="image")
    image_inputs = self.prepare_image_inputs(batch_size=3)
    processing_kwargs = {"return_tensors": "pt", "padding": True, "multi_page": True}

    # Call with nested list of vision inputs
    image_inputs_nested = [image if isinstance(image, list) else [image] for image in image_inputs]
    inputs_dict_nested = {"text": text, "images": image_inputs_nested}
    inputs = processor(**inputs_dict_nested, **processing_kwargs)
    # assertIn reports the missing key and the container on failure, unlike assertTrue(x in y).
    self.assertIn(self.text_input_name, inputs)

    # Call with one of the samples with no associated vision input
    plain_text = "lower newer"
    image_inputs_nested[0] = []
    text[0] = plain_text
    inputs_dict_no_vision = {"text": text, "images": image_inputs_nested}
    inputs_nested = processor(**inputs_dict_no_vision, **processing_kwargs)
    self.assertIn(self.text_input_name, inputs_nested)
|
||||
|
||||
|
||||
@require_torch
class TestImageTextProcessingUtils(unittest.TestCase):
    """Tests for `full_unpacked_stream_to_tensor` and `construct_full_unpacked_stream`."""

    def setUp(self):
        """Create the small fixed inputs shared by both tests."""
        self.batch_size = 2
        self.new_seq_len = 8
        self.num_sub_sequences = 1

        # Inputs for `full_unpacked_stream_to_tensor`: two streams of length 4 and 6.
        self.all_bi_tokens_to_place = [4, 6]
        self.full_unpacked_stream = [torch.tensor([1, 2, 3, 4]), torch.tensor([5, 6, 7, 8, 9, 10])]
        self.fill_value = 0

        # Inputs for `construct_full_unpacked_stream`.
        self.num_real_text_tokens = [[3, 2], [2, 4]]
        # Here the input stream is padded to avoid inconsistencies (current model release matches)
        self.input_stream = torch.tensor([[[1, 2, 3], [4, 5, 0]], [[6, 7, 0], [8, 9, 10]]])
        self.image_tokens = [
            [torch.tensor([1, 2]), torch.tensor([3])],
            [torch.tensor([4, 5, 6]), torch.tensor([7, 8])],
        ]

    def test_full_unpacked_stream_to_tensor(self):
        """Each stream must be placed in its row and the remainder filled with `fill_value`."""
        result = full_unpacked_stream_to_tensor(
            self.all_bi_tokens_to_place,
            self.full_unpacked_stream,
            self.fill_value,
            self.batch_size,
            self.new_seq_len,
            offset=0,
        )
        EXPECTED_TENSOR = torch.tensor([[1, 2, 3, 4, 0, 0, 0, 0], [5, 6, 7, 8, 9, 10, 0, 0]])
        self.assertTrue(torch.equal(result, EXPECTED_TENSOR))

    def test_construct_full_unpacked_stream(self):
        """The returned streams must match the expected tensors one-to-one."""
        result = construct_full_unpacked_stream(
            self.num_real_text_tokens, self.input_stream, self.image_tokens, self.batch_size, self.num_sub_sequences
        )
        EXPECTED_UNPACKED_STREAM = [torch.tensor([1, 2, 1, 2, 3]), torch.tensor([4, 5, 6, 6, 7])]
        # Compare lengths first: without this an empty or truncated result would pass, because
        # the element-wise loop below would simply run fewer (or zero) iterations.
        self.assertEqual(len(result), len(EXPECTED_UNPACKED_STREAM))
        for actual, expected in zip(result, EXPECTED_UNPACKED_STREAM):
            self.assertTrue(torch.equal(actual, expected))
|
||||
|
||||
|
||||
@require_torch
class TestProcessImagesForModelInput(unittest.TestCase):
    """Tests for `FuyuImageProcessor.preprocess_with_tokenizer_info`."""

    def setUp(self):
        """
        Prepare one random image marked as present, plus the ids and patch size used by the test.
        """
        # Patch size is fixed first because the image processor is built from it.
        self.image_patch_dim_h = 16
        self.image_patch_dim_w = 16
        patch_size = {"height": self.image_patch_dim_h, "width": self.image_patch_dim_w}
        self.image_processor = FuyuImageProcessor(patch_size=patch_size)

        # NOTE(review): presumably laid out as (batch, subsequence, channels, height, width) - confirm.
        self.image_input = torch.randn([1, 1, 3, 64, 64])
        self.image_present = torch.tensor([[1]])
        self.image_unpadded_h = torch.tensor([[45]])  # Adjusted for subsequence of 1
        self.image_unpadded_w = torch.tensor([[50]])  # Adjusted for subsequence of 1

        self.image_placeholder_id = 999
        self.image_newline_id = 888
        self.variable_sized = True

    def test_process_images_for_model_input_fixed_sized(self):
        """With `variable_sized=False` the first returned image must have shape (3, 64, 64)."""
        self.variable_sized = False
        call_kwargs = {
            "image_input": self.image_input,
            "image_present": self.image_present,
            "image_unpadded_h": self.image_unpadded_h,
            "image_unpadded_w": self.image_unpadded_w,
            "image_placeholder_id": self.image_placeholder_id,
            "image_newline_id": self.image_newline_id,
            "variable_sized": self.variable_sized,
        }
        result = self.image_processor.preprocess_with_tokenizer_info(**call_kwargs)

        first_image = result["images"][0][0]
        self.assertEqual(first_image.shape, torch.Size([3, 64, 64]))
|
||||
Reference in New Issue
Block a user