init

 0    transformers/tests/models/paligemma/__init__.py (new file)
 629  transformers/tests/models/paligemma/test_modeling_paligemma.py (new file)
@@ -0,0 +1,629 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch PaliGemma model."""

import copy
import unittest

import requests

from transformers import (
    PaliGemmaConfig,
    PaliGemmaForConditionalGeneration,
    PaliGemmaModel,
    PaliGemmaProcessor,
    is_torch_available,
    is_vision_available,
)
from transformers.testing_utils import (
    Expectations,
    cleanup,
    require_read_token,
    require_torch,
    slow,
    torch_device,
)

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor


if is_torch_available():
    import torch


if is_vision_available():
    from PIL import Image


class PaliGemmaVisionText2TextModelTester:
    def __init__(
        self,
        parent,
        ignore_index=-100,
        image_token_index=0,
        projector_hidden_act="gelu",
        seq_length=25,
        vision_feature_select_strategy="default",
        vision_feature_layer=-1,
        projection_dim=32,
        text_config={
            "model_type": "gemma",
            "seq_length": 128,
            "is_training": True,
            # "use_input_mask": True,
            "use_token_type_ids": False,
            "use_labels": True,
            "vocab_size": 99,
            "hidden_size": 32,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
            "num_key_value_heads": 1,
            "head_dim": 8,
            "intermediate_size": 37,
            "hidden_activation": "gelu_pytorch_tanh",
            "hidden_dropout_prob": 0.1,
            "attention_probs_dropout_prob": 0.1,
            "max_position_embeddings": 512,
            "type_vocab_size": 16,
            "type_sequence_label_size": 2,
            "initializer_range": 0.02,
            "num_labels": 3,
            "num_choices": 4,
            "pad_token_id": 1,
        },
        is_training=True,
        vision_config={
            "use_labels": True,
            "image_size": 20,
            "patch_size": 5,
            "num_image_tokens": 4,
            "num_channels": 3,
            "is_training": True,
            "hidden_size": 32,
            "projection_dim": 32,
            "num_key_value_heads": 1,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
            "intermediate_size": 37,
            "dropout": 0.1,
            "attention_dropout": 0.1,
            "initializer_range": 0.02,
        },
        use_cache=False,
    ):
        self.parent = parent
        self.ignore_index = ignore_index
        # `image_token_index` is set to 0 to pass the "resize_embeddings" test; do not modify
        self.image_token_index = image_token_index
        self.projector_hidden_act = projector_hidden_act
        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer
        self.text_config = text_config
        self.vision_config = vision_config
        self.seq_length = seq_length
        self.projection_dim = projection_dim
        self.pad_token_id = text_config["pad_token_id"]

        self.num_hidden_layers = text_config["num_hidden_layers"]
        self.vocab_size = text_config["vocab_size"]
        self.hidden_size = text_config["hidden_size"]
        self.num_attention_heads = text_config["num_attention_heads"]
        self.is_training = is_training

        self.batch_size = 3
        self.num_channels = vision_config["num_channels"]
        self.image_size = vision_config["image_size"]
        self.encoder_seq_length = seq_length
        self.use_cache = use_cache

    def get_config(self):
        return PaliGemmaConfig(
            text_config=self.text_config,
            vision_config=self.vision_config,
            ignore_index=self.ignore_index,
            image_token_index=self.image_token_index,
            projector_hidden_act=self.projector_hidden_act,
            projection_dim=self.projection_dim,
            vision_feature_select_strategy=self.vision_feature_select_strategy,
            vision_feature_layer=self.vision_feature_layer,
        )

    def prepare_config_and_inputs(self):
        pixel_values = floats_tensor(
            [
                self.batch_size,
                self.vision_config["num_channels"],
                self.vision_config["image_size"],
                self.vision_config["image_size"],
            ]
        )
        config = self.get_config()

        return config, pixel_values

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values = config_and_inputs
        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
        attention_mask = input_ids.ne(self.pad_token_id).to(torch_device)

        # set the first 16 tokens to be image tokens, and ensure that no other tokens are image tokens
        # do not change this unless you modify the image size or patch size
        input_ids[input_ids == config.image_token_index] = self.pad_token_id
        input_ids[:, :16] = config.image_token_index
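        # Note: 16 matches the vision config above: (image_size / patch_size) ** 2
        # = (20 / 5) ** 2 = 16 patches, so the first 16 positions hold image tokens.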
        inputs_dict = {
            "pixel_values": pixel_values,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": input_ids,
            "token_type_ids": torch.zeros_like(input_ids),
        }
        return config, inputs_dict


@require_torch
class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    """
    Model tester for `PaliGemmaForConditionalGeneration`.
    """

    all_model_classes = (
        (
            PaliGemmaModel,
            PaliGemmaForConditionalGeneration,
        )
        if is_torch_available()
        else ()
    )
    pipeline_model_mapping = {"image-text-to-text": PaliGemmaForConditionalGeneration}
    additional_model_inputs = ["token_type_ids"]
    fx_compatible = False
    test_pruning = False
    test_torchscript = False
    test_head_masking = False
    _is_composite = True

    def setUp(self):
        self.model_tester = PaliGemmaVisionText2TextModelTester(self)
        self.config_tester = ConfigTester(self, config_class=PaliGemmaConfig, has_text_modality=False)

    # Copied from tests.models.llava.test_modeling_llava.LlavaForConditionalGenerationModelTest.test_mismatching_num_image_tokens
    def test_mismatching_num_image_tokens(self):
        """
        Tests that VLMs throw an error with an explicit message saying what is wrong
        when the number of images doesn't match the number of image tokens in the text.
        Also tests multi-image cases where one prompt has multiple image tokens.
        """
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            model = model_class(config).to(torch_device)
            model.eval()
            curr_input_dict = copy.deepcopy(input_dict)  # deep copy because of in-place modifications below
            _ = model(**curr_input_dict)  # successful forward with no modifications

            # remove one image but leave the image token in the text
            curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
            with self.assertRaises(ValueError):
                _ = model(**curr_input_dict)

            # simulate the multi-image case by concatenating inputs where each one has exactly one image/image-token
            input_ids = curr_input_dict["input_ids"][:1]
            pixel_values = curr_input_dict["pixel_values"][:1]
            input_ids = torch.cat([input_ids, input_ids], dim=0)

            # one image and two image tokens raise an error
            with self.assertRaises(ValueError):
                _ = model(input_ids=input_ids, pixel_values=pixel_values)

            # two images and two image tokens don't raise an error
            pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
            _ = model(input_ids=input_ids, pixel_values=pixel_values)

    @unittest.skip(
        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
    )
    def test_training_gradient_checkpointing(self):
        pass

    @unittest.skip(
        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
    )
    def test_training_gradient_checkpointing_use_reentrant(self):
        pass

    @unittest.skip(
        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
    )
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

    @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
    def test_cpu_offload(self):
        pass

    @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
    def test_disk_offload_bin(self):
        pass

    @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
    def test_disk_offload_safetensors(self):
        pass

    @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
    def test_model_parallelism(self):
        pass

    @unittest.skip(reason="PaliGemma's SigLIP encoder uses a non-standard initialization scheme")
    def test_initialization(self):
        pass

    # TODO extend valid outputs to include this test @Molbap
    @unittest.skip(reason="PaliGemma currently has one output format.")
    def test_model_outputs_equivalence(self):
        pass

    # TODO fix the loss = nan in the chosen testing configuration @Molbap
    @unittest.skip(reason="Edge case giving nan loss values in the testing configuration.")
    def test_determinism(self):
        pass

    @unittest.skip(reason="PaliGemma does not use feedforward chunking.")
    def test_feed_forward_chunking(self):
        pass

    @unittest.skip(
        "VLMs need lots of steps to prepare images/masks correctly to get pad-free inputs. Can be tested as part of the LLM test"
    )
    def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
        pass

    @unittest.skip("PaliGemma position ids are 1-indexed")
    def test_eager_padding_matches_padding_free_with_position_ids(self):
        pass

    @unittest.skip("PaliGemma position ids are 1-indexed")
    def test_sdpa_padding_matches_padding_free_with_position_ids(self):
        pass

    def test_attention_mask_with_token_types(self):
        """Test that attention masking works correctly both with and without token type IDs."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class._from_config(config, attn_implementation="eager")
            config = model.config
            model.to(torch_device)
            model.eval()

            # Case 1: With token_type_ids
            outputs_with_types = model(
                **inputs_dict,
                output_attentions=True,
            )

            # Case 2: Without token_type_ids
            inputs_no_types = {k: v for k, v in inputs_dict.items() if k != "token_type_ids"}
            outputs_no_types = model(
                **inputs_no_types,
                output_attentions=True,
            )

            attention_outputs_with_types = outputs_with_types.attentions
            attention_outputs_no_types = outputs_no_types.attentions

            # Verify pad tokens remain masked in both cases
            attention_mask = inputs_dict["attention_mask"]
            pad_positions = attention_mask == 0

            for layer_attentions in [attention_outputs_with_types, attention_outputs_no_types]:
                for layer_attn in layer_attentions:
                    # Check that pad tokens are properly masked
                    for batch_idx in range(layer_attn.shape[0]):
                        for seq_idx in range(layer_attn.shape[-1]):
                            if pad_positions[batch_idx, seq_idx]:
                                # Verify attention weights for pad tokens are zero
                                self.assertTrue(
                                    torch.all(layer_attn[batch_idx, :, :, seq_idx] == 0),
                                    f"Found non-zero attention weights for padding token at batch {batch_idx}, sequence position {seq_idx}",
                                )
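            # Note: an equivalent vectorized check (a sketch; the explicit loops above are
            # kept because they give clearer per-position failure messages):
            #     pad_mask = pad_positions[:, None, None, :]  # (batch, 1, 1, key_len)
            #     for layer_attn in layer_attentions:
            #         self.assertTrue(torch.all(layer_attn.masked_select(pad_mask) == 0))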


@slow
@require_torch
@require_read_token
class PaliGemmaForConditionalGenerationIntegrationTest(unittest.TestCase):
    def setUp(self):
        self.processor = PaliGemmaProcessor.from_pretrained("google/paligemma-3b-pt-224")

    def tearDown(self):
        cleanup(torch_device, gc_collect=True)

    def test_small_model_integration_test(self):
        # Let's make sure we test the preprocessing to replace what is used
        model_id = "google/paligemma-3b-pt-224"
        model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
        prompt = ""
        image_file = (
            "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
        )
        raw_image = Image.open(requests.get(image_file, stream=True).raw)
        inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt")
        EXPECTED_INPUT_IDS = torch.tensor([[257152] * 256 + [2, 108]])
        self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
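        # The 256 repetitions of token 257152 are the `<image>` placeholders: the 224px
        # checkpoint uses 14px SigLIP patches, i.e. (224 / 14) ** 2 = 256 image tokens,
        # followed by the BOS token (2) and a trailing newline token (108).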

        output = model.generate(**inputs, max_new_tokens=20)
        EXPECTED_DECODED_TEXT = "\ncow on the beach"  # fmt: skip

        self.assertEqual(
            self.processor.decode(output[0], skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    def test_small_model_integration_test_multiimage(self):
        model_id = "google/paligemma-3b-ft-nlvr2-448"  # checkpoint tuned for multiple images
        model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
        processor = PaliGemmaProcessor.from_pretrained(model_id)
        prompt = "answer en There is no snowman in any of the images. Is this true or false?"
        stop_sign_image = Image.open(
            requests.get(
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg",
                stream=True,
            ).raw
        )
        snow_image = Image.open(
            requests.get(
                "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg", stream=True
            ).raw
        )

        inputs = processor(text=prompt, images=[[snow_image, snow_image]], return_tensors="pt")

        output = model.generate(**inputs, max_new_tokens=20)
        EXPECTED_DECODED_TEXT = "answer en There is no snowman in any of the images. Is this true or false?\nFalse"

        self.assertEqual(
            self.processor.decode(output[0], skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

        # try another prompt, with two different images this time
        prompt = "answer en There is exactly one snowman. Is this true or false?"
        inputs = processor(text=prompt, images=[[snow_image, stop_sign_image]], return_tensors="pt")
        output = model.generate(**inputs, max_new_tokens=20)
        EXPECTED_DECODED_TEXT = "answer en There is exactly one snowman. Is this true or false?\nTrue"
        self.assertEqual(
            self.processor.decode(output[0], skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    def test_small_model_integration_test_paligemma_VQA(self):
        # Let's make sure we test the preprocessing to replace what is used
        model_id = "google/paligemma-3b-pt-224"
        model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
        prompt = "answer en Where is the cow standing?"
        image_file = (
            "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
        )
        raw_image = Image.open(requests.get(image_file, stream=True).raw)
        inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt").to(torch.float16)

        output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
        EXPECTED_DECODED_TEXT = "answer en Where is the cow standing?\nbeach"  # fmt: skip

        self.assertEqual(
            self.processor.decode(output[0], skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    def test_small_model_integration_test_paligemma_empty_prompt(self):
        # Let's make sure we test the preprocessing to replace what is used
        model_id = "google/paligemma-3b-pt-224"
        model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)

        prompt = ""
        image_file = (
            "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
        )
        raw_image = Image.open(requests.get(image_file, stream=True).raw)
        inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt").to(torch.float16)

        output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
        EXPECTED_DECODED_TEXT = "\ncow on the beach"  # fmt: skip

        self.assertEqual(
            self.processor.decode(output[0], skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    def test_small_model_integration_test_paligemma_batched(self):
        # Let's make sure we test the preprocessing to replace what is used
        model_id = "google/paligemma-3b-pt-224"

        model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)

        prompts = [
            "answer en Where is the cow standing?",
            "",
        ]
        image1 = Image.open(
            requests.get(
                "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png",
                stream=True,
            ).raw
        )
        image2 = image1

        inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)

        output = model.generate(**inputs, max_new_tokens=20)

        EXPECTED_DECODED_TEXT = ["answer en Where is the cow standing?\nbeach", "\ncow on the beach"]  # fmt: skip

        self.assertEqual(self.processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)

    def test_small_model_integration_test_paligemma_batched_bf16(self):
        # Let's make sure we test the preprocessing to replace what is used
        model_id = "google/paligemma-3b-pt-224"
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id, revision="bfloat16", dtype=torch.bfloat16
        ).to(torch_device)
        # The first batch is longer in terms of text; the second will be padded.
        prompts = [
            "answer en Where is the cow standing?",
            "",
        ]
        image1 = Image.open(
            requests.get(
                "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png",
                stream=True,
            ).raw
        )
        image2 = image1

        inputs = (
            self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
            .to(torch.bfloat16)
            .to(torch_device)
        )
        output = model.generate(**inputs, max_new_tokens=20)

        EXPECTED_DECODED_TEXT = ["answer en Where is the cow standing?\nbeach", "\ncow on the beach"]  # fmt: skip
        self.assertEqual(self.processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)

    def test_small_model_integration_test_paligemma_batched_f16(self):
        # Let's make sure we test the preprocessing to replace what is used
        model_id = "google/paligemma-3b-pt-224"
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id, revision="float16", dtype=torch.float16
        ).to(torch_device)
        # The first batch is longer in terms of text; the second will be padded.
        prompts = [
            "answer en Where is the cow standing?",
            "",
        ]
        image1 = Image.open(
            requests.get(
                "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png",
                stream=True,
            ).raw
        )
        image2 = image1

        inputs = (
            self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
            .to(torch.float16)
            .to(torch_device)
        )

        output = model.generate(**inputs, max_new_tokens=20)

        EXPECTED_DECODED_TEXT = ["answer en Where is the cow standing?\nbeach", "\ncow on the beach"]  # fmt: skip
        self.assertEqual(self.processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)

    def test_integration_detection_bug(self):
        # This is a reproducer of https://github.com/huggingface/transformers/issues/31425, where too little
        # context negatively impacted segmentation generations.
        model_id = "google/paligemma-3b-pt-224"
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id, revision="bfloat16", dtype=torch.bfloat16
        ).to(torch_device)
        prompt = ("detect shoe",)

        image = Image.open(
            requests.get(
                "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/shoe.png",
                stream=True,
            ).raw
        )

        inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(torch.bfloat16).to(torch_device)

        output = model.generate(**inputs, max_new_tokens=20)

        expected_decoded_texts = Expectations(
            {
                ("rocm", (9, 5)): "detect shoe\n<loc0051><loc0309><loc0708><loc0644> shoe",
                (None, None): "detect shoe\n<loc0051><loc0309><loc0708><loc0646> shoe",
                ("cuda", 8): "detect shoe\n<loc0051><loc0309><loc0708><loc0646> shoe",
            }
        )  # fmt: skip
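        # `Expectations` maps (device type, version) keys to per-hardware expected strings;
        # `get_expectation()` picks the closest match for the current device, with
        # (None, None) acting as the default fallback.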
        EXPECTED_DECODED_TEXT = expected_decoded_texts.get_expectation()
        self.assertEqual(self.processor.decode(output[0], skip_special_tokens=True), EXPECTED_DECODED_TEXT)

    def test_paligemma_index_error_bug(self):
        # This is a reproducer of https://github.com/huggingface/transformers/pull/28032 and makes sure it does not happen anymore
        # Please refer to that PR, or specifically https://github.com/huggingface/transformers/pull/28032#issuecomment-1860650043,
        # for more details
        model_id = "google/paligemma-3b-pt-224"
        model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)

        # Simulate a super long prompt
        prompt = "\n" * 200
        image_file = (
            "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
        )

        raw_image = Image.open(requests.get(image_file, stream=True).raw)
        inputs = self.processor(
            images=raw_image,
            text=prompt,
            return_tensors="pt",
        ).to(torch.float16)

        # Make sure that `generate` works
        _ = model.generate(**inputs, max_new_tokens=20)

    def test_paligemma_finetuning_with_suffixes_bf16(self):
        # This is a supplementary test to ensure that PaliGemma fine-tuning, which relies on token_type_ids, is robust to future changes
        model_id = "google/paligemma-3b-pt-224"
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id, revision="bfloat16", dtype=torch.bfloat16
        ).to(torch_device)
        # The first batch is longer in terms of text; the second will be padded.
        prompts = [
            "answer en Where is the cow standing?",
            "",
        ]

        suffixes = ["beach", "cow standing on the beach"]
        image1 = Image.open(
            requests.get(
                "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png",
                stream=True,
            ).raw
        )
        image2 = image1

        inputs = (
            self.processor(images=[image1, image2], text=prompts, suffix=suffixes, return_tensors="pt", padding=True)
            .to(torch.bfloat16)
            .to(torch_device)
        )

        expected_labels = torch.tensor(
            [266 * [-100] + [54901, 1], 262 * [-100] + [14706, 9980, 611, 573, 8318, 1]]
        ).to(torch_device)

        assert torch.equal(inputs["labels"], expected_labels)

        expected_token_type_ids = torch.tensor([266 * [0] + 2 * [1], 262 * [0] + 6 * [1]]).to(torch_device)

        assert torch.equal(inputs["token_type_ids"], expected_token_type_ids)
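        # The token_type_ids separate the prefix (0: image tokens + prompt, which PaliGemma
        # attends to bidirectionally and which is masked out of the loss with -100) from the
        # suffix (1: the target text, trained causally), matching the labels asserted above.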

        output = model(**inputs)

        # check that the loss does not error out
        _ = output.loss

 124  transformers/tests/models/paligemma/test_processing_paligemma.py (new file)

@@ -0,0 +1,124 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import shutil
import tempfile
import unittest

from transformers import GemmaTokenizer, PaliGemmaProcessor
from transformers.testing_utils import get_tests_dir, require_torch, require_vision
from transformers.utils import is_vision_available

from ...test_processing_common import ProcessorTesterMixin


if is_vision_available():
    from transformers import SiglipImageProcessor

SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")


@require_vision
class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = PaliGemmaProcessor

    @classmethod
    def setUpClass(cls):
        cls.tmpdirname = tempfile.mkdtemp()
        image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
        image_processor.image_seq_length = 0  # TODO: raushan fix me in #37342
        tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
        processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer)
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdirname, ignore_errors=True)

    # Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens
    def test_get_num_vision_tokens(self):
        "Tests general functionality of the helper used internally in vLLM"

        processor = self.get_processor()

        output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
        self.assertTrue("num_image_tokens" in output)
        self.assertEqual(len(output["num_image_tokens"]), 3)

        self.assertTrue("num_image_patches" in output)
        self.assertEqual(len(output["num_image_patches"]), 3)

    @require_torch
    @require_vision
    def test_image_seq_length(self):
        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer", max_length=112, padding="max_length")
        image_processor.image_seq_length = 14
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        inputs = processor(
            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
        )
        self.assertEqual(len(inputs["input_ids"][0]), 112)
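        # Note: the 14 image placeholder tokens inserted by the processor count toward
        # `max_length`, so the padded sequence still comes out at exactly 112 tokens.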

    @require_torch
    def test_call_with_suffix(self):
        input_str = "lower newer"
        suffix = "upper older longer string"
        image_input = self.prepare_image_inputs()
        processor = self.get_processor()
        inputs = processor(text=input_str, images=image_input, suffix=suffix)
        self.assertTrue("labels" in inputs)
        self.assertEqual(len(inputs["labels"][0]), len(inputs["input_ids"][0]))

        inputs = processor(text=input_str, images=image_input, suffix=suffix, return_tensors="pt")
        self.assertTrue("labels" in inputs)
        self.assertEqual(len(inputs["labels"][0]), len(inputs["input_ids"][0]))
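        # Passing `suffix` makes the processor emit `labels` aligned one-to-one with
        # `input_ids`; prefix positions are expected to carry the ignore index (-100), so
        # only the suffix tokens are supervised (see the fine-tuning test in
        # test_modeling_paligemma.py).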

    def test_text_with_image_tokens(self):
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer")

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        text_multi_images = "<image><image>Dummy text!"
        text_single_image = "<image>Dummy text!"
        text_no_image = "Dummy text!"

        image = self.prepare_image_inputs()

        out_noimage = processor(text=text_no_image, images=image, return_tensors="np")
        out_singlimage = processor(text=text_single_image, images=image, return_tensors="np")
        for k in out_noimage:
            self.assertTrue(out_noimage[k].tolist() == out_singlimage[k].tolist())

        out_multiimages = processor(text=text_multi_images, images=[image, image], return_tensors="np")
        out_noimage = processor(text=text_no_image, images=[[image, image]], return_tensors="np")

        # We can't be sure of the user's intention: did they want "one text + two images",
        # or did they forget to add the second text?
        with self.assertRaises(ValueError):
            out_noimage = processor(text=text_no_image, images=[image, image], return_tensors="np")

        for k in out_noimage:
            self.assertTrue(out_noimage[k].tolist() == out_multiimages[k].tolist())

        text_batched = ["Dummy text!", "Dummy text!"]
        text_batched_with_image = ["<image>Dummy text!", "<image>Dummy text!"]
        out_images = processor(text=text_batched_with_image, images=[image, image], return_tensors="np")
        out_noimage_nested = processor(text=text_batched, images=[[image], [image]], return_tensors="np")
        out_noimage = processor(text=text_batched, images=[image, image], return_tensors="np")
        for k in out_noimage:
            self.assertTrue(out_noimage[k].tolist() == out_images[k].tolist() == out_noimage_nested[k].tolist())