This commit is contained in:
2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions

View File

@@ -0,0 +1,638 @@
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import inspect
import unittest
from functools import cached_property
from datasets import load_dataset
from transformers import UdopConfig, is_torch_available
from transformers.testing_utils import (
require_sentencepiece,
require_tokenizers,
require_torch,
require_vision,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
import torch.nn.functional as F
from transformers import UdopEncoderModel, UdopForConditionalGeneration, UdopModel, UdopProcessor
class UdopModelTester:
    """Builds a tiny UDOP encoder-decoder config and matching random inputs.

    ``parent`` is the running ``unittest.TestCase``; every ``create_and_check_*``
    helper reports its assertions through it. The remaining constructor arguments
    are stored as attributes and forwarded to ``UdopConfig`` by ``get_config``.
    """

    def __init__(
        self,
        parent,
        vocab_size=99,
        batch_size=13,
        encoder_seq_length=7,
        decoder_seq_length=9,
        # For common tests
        is_training=True,
        use_attention_mask=True,
        use_labels=True,
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        d_ff=37,
        relative_attention_num_buckets=32,
        dropout_rate=0.1,
        initializer_factor=0.002,
        eos_token_id=1,
        pad_token_id=0,
        scope=None,
        decoder_layers=None,
        range_bbox=1000,
        decoder_start_token_id=0,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.encoder_seq_length = encoder_seq_length
        self.decoder_seq_length = decoder_seq_length
        # For common tests
        self.seq_length = self.decoder_seq_length
        self.is_training = is_training
        self.use_attention_mask = use_attention_mask
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.d_ff = d_ff
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.dropout_rate = dropout_rate
        self.initializer_factor = initializer_factor
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        # Keep the caller-supplied scope (it was previously always reset to None).
        self.scope = scope
        self.decoder_layers = decoder_layers
        self.range_bbox = range_bbox
        self.decoder_start_token_id = decoder_start_token_id

    @staticmethod
    def _legalize_bbox(bbox):
        """Return a copy of ``bbox`` whose last axis is ordered as (x0, y0, x1, y1) with x0 <= x1 and y0 <= y1.

        The last axis must have size 4. An element-wise swap through ``t = bbox[i, j, k]``
        does not work on tensors: ``t`` is a view, so it changes together with the slot it
        was read from and both coordinates end up equal. Taking min/max avoids the aliasing.
        """
        x_low = torch.minimum(bbox[..., 0], bbox[..., 2])
        x_high = torch.maximum(bbox[..., 0], bbox[..., 2])
        y_low = torch.minimum(bbox[..., 1], bbox[..., 3])
        y_high = torch.maximum(bbox[..., 1], bbox[..., 3])
        return torch.stack([x_low, y_low, x_high, y_high], dim=-1)

    def prepare_config_and_inputs(self):
        """Create the config and one batch of inputs.

        Returns:
            A 7-tuple ``(config, input_ids, bbox, decoder_input_ids, attention_mask,
            decoder_attention_mask, lm_labels)``. Both masks are ``None`` when
            ``use_attention_mask`` is false, and ``lm_labels`` is ``None`` when
            ``use_labels`` is false.
        """
        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
        bbox = ids_tensor([self.batch_size, self.encoder_seq_length, 4], self.range_bbox).float()
        # Ensure that bbox is legal
        bbox = self._legalize_bbox(bbox)

        decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

        attention_mask = None
        decoder_attention_mask = None
        if self.use_attention_mask:
            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
            decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)

        lm_labels = None
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

        config = self.get_config()

        return (
            config,
            input_ids,
            bbox,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        )

    def get_config(self):
        """Return a ``UdopConfig`` built from the sizes stored on this tester."""
        return UdopConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            d_ff=self.d_ff,
            d_kv=self.hidden_size // self.num_attention_heads,
            num_layers=self.num_hidden_layers,
            num_decoder_layers=self.decoder_layers,
            num_heads=self.num_attention_heads,
            relative_attention_num_buckets=self.relative_attention_num_buckets,
            dropout_rate=self.dropout_rate,
            initializer_factor=self.initializer_factor,
            eos_token_id=self.eos_token_id,
            # The pad token doubles as the BOS token in this test config.
            bos_token_id=self.pad_token_id,
            pad_token_id=self.pad_token_id,
            decoder_start_token_id=self.decoder_start_token_id,
        )

    def create_and_check_model(
        self,
        config,
        input_ids,
        bbox,
        decoder_input_ids,
        attention_mask,
        decoder_attention_mask,
        lm_labels,
    ):
        """Run ``UdopModel`` with and without attention masks and check output shapes and cache layout."""
        model = UdopModel(config=config)
        model.to(torch_device)
        model.eval()
        # The masked forward pass is only exercised; its output is not inspected.
        model(
            input_ids=input_ids,
            bbox=bbox,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )
        # The assertions below look at the result of the mask-free call.
        result = model(input_ids=input_ids, bbox=bbox, decoder_input_ids=decoder_input_ids)
        decoder_output = result.last_hidden_state
        decoder_past = result.past_key_values
        encoder_output = result.encoder_last_hidden_state

        self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.encoder_seq_length, self.hidden_size))
        self.parent.assertEqual(decoder_output.size(), (self.batch_size, self.decoder_seq_length, self.hidden_size))
        # There should be `num_layers` key value embeddings stored in decoder_past
        self.parent.assertEqual(len(decoder_past), config.num_layers)
        # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple
        self.parent.assertEqual(len(decoder_past[0]), 4)

    def create_and_check_with_lm_head(
        self,
        config,
        input_ids,
        bbox,
        decoder_input_ids,
        attention_mask,
        decoder_attention_mask,
        lm_labels,
    ):
        """Run ``UdopForConditionalGeneration`` with labels and check the logits and loss shapes."""
        model = UdopForConditionalGeneration(config=config).to(torch_device).eval()
        outputs = model(
            input_ids=input_ids,
            bbox=bbox,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )
        self.parent.assertEqual(len(outputs), 4)
        self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size))
        # The loss is expected to be a scalar (0-dim) tensor.
        self.parent.assertEqual(outputs["loss"].size(), ())

    def create_and_check_generate_with_past_key_values(
        self,
        config,
        input_ids,
        bbox,
        decoder_input_ids,
        attention_mask,
        decoder_attention_mask,
        lm_labels,
    ):
        """Check that sampled generation yields the same tokens with and without ``use_cache``."""
        model = UdopForConditionalGeneration(config=config).to(torch_device).eval()
        # Re-seed before each call so both sampling runs draw the same random numbers.
        torch.manual_seed(0)
        output_without_past_cache = model.generate(
            input_ids[:1], bbox=bbox[:1, :, :], num_beams=2, max_length=5, do_sample=True, use_cache=False
        )
        torch.manual_seed(0)
        output_with_past_cache = model.generate(
            input_ids[:1], bbox=bbox[:1, :, :], num_beams=2, max_length=5, do_sample=True
        )
        self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))

    def create_and_check_model_fp16_forward(
        self,
        config,
        input_ids,
        bbox,
        decoder_input_ids,
        attention_mask,
        decoder_attention_mask,
        lm_labels,
    ):
        """Run a half-precision forward pass and assert that the logits contain no NaN."""
        model = UdopForConditionalGeneration(config=config).to(torch_device).half().eval()
        output = model(input_ids, bbox=bbox, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids).logits
        self.parent.assertFalse(torch.isnan(output).any().item())

    def prepare_config_and_inputs_for_common(self):
        """Return ``(config, inputs_dict)`` for the shared mixin tests; labels are dropped and ``use_cache`` is False."""
        (
            config,
            input_ids,
            bbox,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        ) = self.prepare_config_and_inputs()

        inputs_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "bbox": bbox,
            "decoder_input_ids": decoder_input_ids,
            "decoder_attention_mask": decoder_attention_mask,
            "use_cache": False,
        }
        return config, inputs_dict
@require_torch
class UdopModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """Runs the shared model, generation and pipeline mixin tests on ``UdopModel`` and ``UdopForConditionalGeneration``."""

    all_model_classes = (
        (
            UdopModel,
            UdopForConditionalGeneration,
        )
        if is_torch_available()
        else ()
    )
    pipeline_model_mapping = (
        {"feature-extraction": UdopModel, "image-text-to-text": UdopForConditionalGeneration}
        if is_torch_available()
        else {}
    )
    fx_compatible = False
    test_pruning = False
    test_torchscript = False
    test_head_masking = False
    test_resize_embeddings = True
    test_model_parallel = False
    is_encoder_decoder = True
    test_cpu_offload = False
    # The small UDOP model needs higher percentages for CPU/MP tests
    model_split_percents = [0.8, 0.9]

    def setUp(self):
        self.model_tester = UdopModelTester(self)
        self.config_tester = ConfigTester(self, config_class=UdopConfig, d_model=37)

    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
        # Work on a deep copy so the caller's dict is never mutated; all-zero labels
        # are added only for the conditional-generation head.
        inputs_dict = copy.deepcopy(inputs_dict)
        if model_class.__name__ == "UdopForConditionalGeneration":
            if return_labels:
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                )
        return inputs_dict

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_with_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_with_lm_head(*config_and_inputs)

    def test_generate_with_past_key_values(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_generate_with_past_key_values(*config_and_inputs)

    @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
    def test_model_fp16_forward(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)

    @unittest.skip(reason="Gradient checkpointing is not supported by this model")
    def test_training_gradient_checkpointing(self):
        pass

    @unittest.skip(
        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
    )
    def test_training_gradient_checkpointing_use_reentrant(self):
        pass

    @unittest.skip(
        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
    )
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.forward)
            # Names are compared in alphabetical order: the check is that the
            # alphabetically-first parameters of forward() are exactly the expected ones.
            arg_names = sorted(signature.parameters)

            expected_arg_names = [
                "attention_mask",
                "bbox",
                "cache_position",
                "cross_attn_head_mask",
                "decoder_attention_mask",
                "decoder_head_mask",
                "decoder_input_ids",
                "decoder_inputs_embeds",
                "encoder_outputs",
                "head_mask",
                "input_ids",
                "inputs_embeds",
            ]
            if model_class in self.all_generative_model_classes:
                expected_arg_names.append("labels")
            expected_arg_names = sorted(expected_arg_names)
            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)

    # overwrite because T5 doesn't accept position ids as input and expects `decoder_input_ids`
    def test_custom_4d_attention_mask(self):
        for model_class in self.all_generative_model_classes:
            config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
            model = model_class(config).to(device=torch_device, dtype=torch.float32)

            # NOTE(review): `_get_custom_4d_mask_test_data` is not defined in this file;
            # presumably it comes from one of the mixins.
            (
                input_ids,
                _,
                input_ids_shared_prefix,
                mask_shared_prefix,
                _,
            ) = self._get_custom_4d_mask_test_data()

            logits = model.forward(
                decoder_input_ids=input_ids,
                input_ids=input_dict["input_ids"][:3],
                bbox=input_dict["bbox"][:3],
            ).logits
            # logits.shape == torch.Size([3, 4, ...])

            logits_shared_prefix = model(
                input_ids=input_dict["input_ids"][:1],
                bbox=input_dict["bbox"][:1],
                decoder_input_ids=input_ids_shared_prefix,
                decoder_attention_mask=mask_shared_prefix,
            )[0]
            # logits_shared_prefix.shape == torch.Size([1, 6, ...])

            out_last_tokens = logits[:, -1, :]  # last tokens in each batch line
            out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :]  # last three tokens

            # comparing softmax-normalized logits:
            # `dim` is passed explicitly because the implicit choice is deprecated;
            # for these 2-D tensors the last axis is the one the implicit rule picked.
            normalized_0 = F.softmax(out_last_tokens, dim=-1)
            normalized_1 = F.softmax(out_shared_prefix_last_tokens, dim=-1)
            torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)

    @slow
    def test_model_from_pretrained(self):
        model_name = "microsoft/udop-large"
        model = UdopForConditionalGeneration.from_pretrained(model_name)
        self.assertIsNotNone(model)

    @unittest.skip(reason="TODO: Fix me @joao")
    def test_generate_without_input_ids(self):
        pass
class UdopEncoderOnlyModelTester:
    """Builds a tiny UDOP config (``is_encoder_decoder=False``) and random inputs for ``UdopEncoderModel`` tests.

    ``parent`` is the running ``unittest.TestCase`` used for the assertions in the
    ``create_and_check_*`` helpers.
    """

    def __init__(
        self,
        parent,
        vocab_size=99,
        batch_size=13,
        seq_length=7,
        # For common tests
        is_training=False,
        use_attention_mask=True,
        hidden_size=32,
        num_hidden_layers=2,
        decoder_layers=2,
        num_attention_heads=4,
        d_ff=37,
        relative_attention_num_buckets=32,
        dropout_rate=0.1,
        initializer_factor=0.002,
        eos_token_id=1,
        pad_token_id=0,
        scope=None,
        range_bbox=1000,
    ):
        self.parent = parent
        self.batch_size = batch_size
        # For common tests
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_attention_mask = use_attention_mask
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.decoder_layers = decoder_layers
        self.num_attention_heads = num_attention_heads
        self.d_ff = d_ff
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.dropout_rate = dropout_rate
        self.initializer_factor = initializer_factor
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        # Keep the caller-supplied scope (it was previously always reset to None).
        self.scope = scope
        self.range_bbox = range_bbox

    @staticmethod
    def _legalize_bbox(bbox):
        """Return a copy of ``bbox`` whose last axis is ordered as (x0, y0, x1, y1) with x0 <= x1 and y0 <= y1.

        The last axis must have size 4. Swapping through ``t = bbox[i, j, k]`` is
        unreliable on tensors because ``t`` is a view of the slot being overwritten;
        min/max sidesteps that aliasing.
        """
        x_low = torch.minimum(bbox[..., 0], bbox[..., 2])
        x_high = torch.maximum(bbox[..., 0], bbox[..., 2])
        y_low = torch.minimum(bbox[..., 1], bbox[..., 3])
        y_high = torch.maximum(bbox[..., 1], bbox[..., 3])
        return torch.stack([x_low, y_low, x_high, y_high], dim=-1)

    def get_config(self):
        """Return a ``UdopConfig`` built from the sizes stored on this tester."""
        return UdopConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            d_ff=self.d_ff,
            d_kv=self.hidden_size // self.num_attention_heads,
            num_layers=self.num_hidden_layers,
            num_decoder_layers=self.decoder_layers,
            num_heads=self.num_attention_heads,
            relative_attention_num_buckets=self.relative_attention_num_buckets,
            dropout_rate=self.dropout_rate,
            initializer_factor=self.initializer_factor,
            eos_token_id=self.eos_token_id,
            # The pad token doubles as the BOS token in this test config.
            bos_token_id=self.pad_token_id,
            pad_token_id=self.pad_token_id,
            is_encoder_decoder=False,
        )

    def prepare_config_and_inputs(self):
        """Create the config and one batch of inputs.

        Returns:
            A 4-tuple ``(config, input_ids, bbox, attention_mask)``; ``attention_mask``
            is ``None`` when ``use_attention_mask`` is false.
        """
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox).float()
        # Ensure that bbox is legal
        bbox = self._legalize_bbox(bbox)

        attention_mask = None
        if self.use_attention_mask:
            attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        config = self.get_config()

        return (
            config,
            input_ids,
            bbox,
            attention_mask,
        )

    def prepare_config_and_inputs_for_common(self):
        """Return ``(config, inputs_dict)`` in the keyword form used by the shared mixin tests."""
        (
            config,
            input_ids,
            bbox,
            attention_mask,
        ) = self.prepare_config_and_inputs()

        inputs_dict = {
            "input_ids": input_ids,
            "bbox": bbox,
            "attention_mask": attention_mask,
        }
        return config, inputs_dict

    def create_and_check_model(
        self,
        config,
        input_ids,
        bbox,
        attention_mask,
    ):
        """Run ``UdopEncoderModel`` once and check the shape of ``last_hidden_state``."""
        model = UdopEncoderModel(config=config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
        )
        encoder_output = result.last_hidden_state

        self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.seq_length, self.hidden_size))

    def create_and_check_model_fp16_forward(
        self,
        config,
        input_ids,
        bbox,
        attention_mask,
    ):
        """Run a half-precision forward pass and assert that the hidden states contain no NaN."""
        model = UdopEncoderModel(config=config).to(torch_device).half().eval()
        output = model(input_ids, bbox=bbox, attention_mask=attention_mask)["last_hidden_state"]
        self.parent.assertFalse(torch.isnan(output).any().item())
class UdopEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase):
    """Runs the shared ``ModelTesterMixin`` tests on ``UdopEncoderModel``."""

    all_model_classes = (UdopEncoderModel,) if is_torch_available() else ()
    test_pruning = False
    test_torchscript = False
    test_head_masking = False
    test_resize_embeddings = False
    test_model_parallel = False
    all_parallelizable_model_classes = (UdopEncoderModel,) if is_torch_available() else ()

    def setUp(self):
        self.model_tester = UdopEncoderOnlyModelTester(self)
        self.config_tester = ConfigTester(self, config_class=UdopConfig, d_model=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    # overwrite because T5 doesn't accept position ids as input and expects `decoder_input_ids`
    def test_custom_4d_attention_mask(self):
        # NOTE(review): `all_generative_model_classes` is not set on this class; it is
        # presumably inherited from the mixin, and the loop body only runs for classes listed there.
        for model_class in self.all_generative_model_classes:
            config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
            model = model_class(config).to(device=torch_device, dtype=torch.float32)

            (
                input_ids,
                _,
                input_ids_shared_prefix,
                mask_shared_prefix,
                _,
            ) = self._get_custom_4d_mask_test_data()

            logits = model.forward(
                decoder_input_ids=input_ids,
                input_ids=input_dict["input_ids"][:3],
            ).logits
            # logits.shape == torch.Size([3, 4, ...])

            logits_shared_prefix = model(
                input_ids=input_dict["input_ids"][:1],
                decoder_input_ids=input_ids_shared_prefix,
                decoder_attention_mask=mask_shared_prefix,
            )[0]
            # logits_shared_prefix.shape == torch.Size([1, 6, ...])

            out_last_tokens = logits[:, -1, :]  # last tokens in each batch line
            out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :]  # last three tokens

            # comparing softmax-normalized logits:
            # `dim` is passed explicitly because the implicit choice is deprecated;
            # for these 2-D tensors the last axis is the one the implicit rule picked.
            normalized_0 = F.softmax(out_last_tokens, dim=-1)
            normalized_1 = F.softmax(out_shared_prefix_last_tokens, dim=-1)
            torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)
@require_torch
@require_sentencepiece
@require_tokenizers
@require_vision
@slow
class UdopModelIntegrationTests(unittest.TestCase):
    """End-to-end generation check against the ``microsoft/udop-large`` checkpoint."""

    @cached_property
    def image(self):
        # Second sample of the DocVQA fixture set; loaded once per test instance.
        fixtures = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
        sample = fixtures[1]
        return sample["image"]

    @cached_property
    def processor(self):
        checkpoint = "microsoft/udop-large"
        return UdopProcessor.from_pretrained(checkpoint)

    @cached_property
    def model(self):
        pretrained = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large")
        return pretrained.to(torch_device)

    def test_conditional_generation(self):
        # Resolve the cached processor and model up front, in the same order as before.
        doc_processor = self.processor
        udop = self.model

        question = "Question answering. In which year is the report made?"
        model_inputs = doc_processor(images=self.image, text=question, return_tensors="pt").to(torch_device)

        generated = udop.generate(**model_inputs)
        decoded = doc_processor.batch_decode(generated, skip_special_tokens=True)
        answer = decoded[0]

        self.assertEqual(answer, "2013")

View File

@@ -0,0 +1,500 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import tempfile
import unittest
from functools import cached_property
from transformers import (
PreTrainedTokenizer,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
UdopProcessor,
UdopTokenizer,
UdopTokenizerFast,
)
from transformers.testing_utils import (
require_pytesseract,
require_sentencepiece,
require_tokenizers,
require_torch,
slow,
)
from transformers.utils import is_pytesseract_available, is_torch_available
from ...test_processing_common import ProcessorTesterMixin
if is_torch_available():
import torch
if is_pytesseract_available():
from transformers import LayoutLMv3ImageProcessor
@require_pytesseract
@require_sentencepiece
@require_tokenizers
class UdopProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Save/load and encoding tests for ``UdopProcessor`` on top of the shared processor mixin."""

    tokenizer_class = UdopTokenizer
    rust_tokenizer_class = UdopTokenizerFast
    processor_class = UdopProcessor
    maxDiff = None

    @classmethod
    def setUpClass(cls):
        cls.tmpdirname = tempfile.mkdtemp()

        # First pass: write a processor built from scratch into the shared temp dir.
        UdopProcessor(
            image_processor=LayoutLMv3ImageProcessor(
                do_resize=True,
                size=224,
                apply_ocr=True,
            ),
            tokenizer=UdopTokenizer.from_pretrained("microsoft/udop-large"),
        ).save_pretrained(cls.tmpdirname)

        cls.tokenizer_pretrained_name = "microsoft/udop-large"

        # Second pass: rebuild it through the class helpers and save it again.
        UdopProcessor(
            image_processor=cls.get_image_processor(),
            tokenizer=cls.get_tokenizers()[0],
        ).save_pretrained(cls.tmpdirname)

    @classmethod
    def get_tokenizer(cls, **kwargs) -> PreTrainedTokenizer:
        """Load the slow tokenizer from ``cls.tokenizer_pretrained_name``."""
        checkpoint = cls.tokenizer_pretrained_name
        return cls.tokenizer_class.from_pretrained(checkpoint, **kwargs)

    @classmethod
    def get_image_processor(cls, **kwargs):
        """Load the image processor that ``setUpClass`` saved into ``cls.tmpdirname``."""
        saved_dir = cls.tmpdirname
        return LayoutLMv3ImageProcessor.from_pretrained(saved_dir, **kwargs)

    @classmethod
    def get_rust_tokenizer(cls, **kwargs) -> PreTrainedTokenizerFast:
        """Load the fast tokenizer from ``cls.tokenizer_pretrained_name``."""
        checkpoint = cls.tokenizer_pretrained_name
        return cls.rust_tokenizer_class.from_pretrained(checkpoint, **kwargs)

    @classmethod
    def get_tokenizers(cls, **kwargs) -> list[PreTrainedTokenizerBase]:
        """Return ``[slow_tokenizer, fast_tokenizer]``, both built with the same kwargs."""
        slow_tok = cls.get_tokenizer(**kwargs)
        fast_tok = cls.get_rust_tokenizer(**kwargs)
        return [slow_tok, fast_tok]

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdirname, ignore_errors=True)

    def test_save_load_pretrained_default(self):
        reference_image_processor = self.get_image_processor()

        for reference_tokenizer in self.get_tokenizers():
            original = UdopProcessor(image_processor=reference_image_processor, tokenizer=reference_tokenizer)

            # Round-trip the processor through a throwaway directory.
            with tempfile.TemporaryDirectory() as save_dir:
                original.save_pretrained(save_dir)
                reloaded = UdopProcessor.from_pretrained(save_dir)

            self.assertEqual(reloaded.tokenizer.get_vocab(), reference_tokenizer.get_vocab())
            self.assertIsInstance(reloaded.tokenizer, (UdopTokenizer, UdopTokenizerFast))
            self.assertEqual(
                reloaded.image_processor.to_json_string(), reference_image_processor.to_json_string()
            )
            self.assertIsInstance(reloaded.image_processor, LayoutLMv3ImageProcessor)

    def test_save_load_pretrained_additional_features(self):
        with tempfile.TemporaryDirectory() as save_dir:
            UdopProcessor(
                image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer()
            ).save_pretrained(save_dir)

            # slow tokenizer
            expected_tokenizer = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
            expected_image_processor = self.get_image_processor(do_resize=False, size=30)

            reloaded = UdopProcessor.from_pretrained(
                save_dir,
                use_fast=False,
                bos_token="(BOS)",
                eos_token="(EOS)",
                do_resize=False,
                size=30,
            )

        self.assertEqual(reloaded.tokenizer.get_vocab(), expected_tokenizer.get_vocab())
        self.assertIsInstance(reloaded.tokenizer, UdopTokenizer)
        self.assertEqual(reloaded.image_processor.to_json_string(), expected_image_processor.to_json_string())
        self.assertIsInstance(reloaded.image_processor, LayoutLMv3ImageProcessor)

        # fast tokenizer
        expected_tokenizer = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
        expected_image_processor = self.get_image_processor(do_resize=False, size=30)

        # This half reloads from the class-level directory written in setUpClass.
        reloaded = UdopProcessor.from_pretrained(
            self.tmpdirname, use_xlm=True, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
        )

        self.assertEqual(reloaded.tokenizer.get_vocab(), expected_tokenizer.get_vocab())
        self.assertIsInstance(reloaded.tokenizer, UdopTokenizerFast)
        self.assertEqual(reloaded.image_processor.to_json_string(), expected_image_processor.to_json_string())
        self.assertIsInstance(reloaded.image_processor, LayoutLMv3ImageProcessor)

    def test_text_target(self):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
        processor = UdopProcessor(tokenizer=tokenizer, image_processor=image_processor)

        target_text = "hello world"
        expected_decoding = "hello world</s>"

        # The processor must produce the same target encoding as the bare tokenizer.
        from_processor = processor(text_target=target_text)
        from_tokenizer = tokenizer(text_target=target_text)

        self.assertListEqual(from_processor["input_ids"], [21820, 296, 1])
        self.assertListEqual(from_processor["attention_mask"], [1, 1, 1])
        self.assertDictEqual(dict(from_processor), dict(from_tokenizer))
        self.assertEqual(tokenizer.decode(from_processor["input_ids"]), expected_decoding)

    @slow
    def test_overflowing_tokens(self):
        # In the case of overflowing tokens, test that we still have 1-to-1 mapping between the images and input_ids (sequences that are too long are broken down into multiple sequences).

        from datasets import load_dataset

        # set up
        funsd = load_dataset("nielsr/funsd")
        processor = UdopProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)

        def preprocess_data(examples):
            rgb_images = [image.convert("RGB") for image in examples["image"]]
            return processor(
                rgb_images,
                examples["words"],
                boxes=examples["bboxes"],
                word_labels=examples["ner_tags"],
                max_length=512,
                padding="max_length",
                truncation=True,
                return_overflowing_tokens=True,
                stride=50,
                return_offsets_mapping=True,
                return_tensors="pt",
            )

        encoded_train = preprocess_data(funsd["train"])

        self.assertEqual(len(encoded_train["pixel_values"]), len(encoded_train["input_ids"]))

    @unittest.skip("We will not support batch input with and without images for UDOP!")
    def test_processor_text_has_no_visual(self):
        pass
# different use cases tests
@require_sentencepiece
@require_torch
@require_pytesseract
class UdopProcessorIntegrationTests(unittest.TestCase):
@cached_property
def get_images(self):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")
@cached_property
def get_tokenizers(self):
slow_tokenizer = UdopTokenizer.from_pretrained("microsoft/udop-large")
fast_tokenizer = UdopTokenizerFast.from_pretrained("microsoft/udop-large")
return [slow_tokenizer, fast_tokenizer]
@slow
def test_processor_case_1(self):
# case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
image_processor = LayoutLMv3ImageProcessor()
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
# not batched
input_image_processor = image_processor(images[0], return_tensors="pt")
input_processor = processor(images[0], return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
actual_keys = sorted(input_processor.keys())
self.assertListEqual(actual_keys, expected_keys)
# verify pixel_values
self.assertTrue(
torch.allclose(input_image_processor["pixel_values"], input_processor["pixel_values"], atol=1e-2)
)
# verify input_ids
# this was obtained with Tesseract 4.1.1
# fmt: off
expected_decoding = "11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>" # noqa: E231
# fmt: on
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
input_image_processor = image_processor(images, return_tensors="pt")
input_processor = processor(images, padding=True, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
actual_keys = sorted(input_processor.keys())
self.assertListEqual(actual_keys, expected_keys)
# verify pixel_values
self.assertTrue(
torch.allclose(input_image_processor["pixel_values"], input_processor["pixel_values"], atol=1e-2)
)
# verify input_ids
# this was obtained with Tesseract 4.1.1
# fmt: off
expected_decoding = "7 ITC Limited REPORT AND ACCOUNTS 2013 ITCs Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITCs value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITCs brands national assets, adding to Indias competitiveness. It is ITCs aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>" # noqa: E231
# fmt: on
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
@slow
def test_processor_case_2(self):
# case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
# not batched
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
input_processor = processor(images[0], words, boxes=boxes, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
actual_keys = list(input_processor.keys())
for key in expected_keys:
self.assertIn(key, actual_keys)
# verify input_ids
expected_decoding = "hello world</s>"
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
words = [["hello", "world"], ["my", "name", "is", "niels"]]
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
input_processor = processor(images, words, boxes=boxes, padding=True, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
actual_keys = sorted(input_processor.keys())
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "hello world</s><pad><pad><pad><pad>"
decoding = processor.decode(input_processor.input_ids[0].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
expected_bbox = [
[3, 2, 5, 1],
[6, 7, 4, 2],
[3, 9, 2, 4],
[1, 1, 2, 3],
[1, 1, 2, 3],
[1, 1, 2, 3],
[1000, 1000, 1000, 1000],
]
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
@slow
def test_processor_case_3(self):
    # case 3: token classification (training), apply_ocr=False
    # word-level labels are passed next to the words and boxes, and a "labels" entry is expected back
    ocr_free_image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
    all_tokenizers, document_images = self.get_tokenizers, self.get_images
    keys_with_labels = ["attention_mask", "bbox", "input_ids", "labels", "pixel_values"]

    for current_tokenizer in all_tokenizers:
        processor = UdopProcessor(image_processor=ocr_free_image_processor, tokenizer=current_tokenizer)

        # ---- single example ----
        encoding = processor(
            document_images[0],
            ["weirdly", "world"],
            boxes=[[1, 2, 3, 4], [5, 6, 7, 8]],
            word_labels=[1, 2],
            return_tensors="pt",
        )

        # keys: sorted key list has to match exactly
        self.assertListEqual(sorted(encoding.keys()), keys_with_labels)

        # input_ids: decoding gives the two words back, followed by "</s>"
        decoded_text = processor.decode(encoding.input_ids.squeeze().tolist())
        self.assertSequenceEqual(decoded_text, "weirdly world</s>")

        # labels: four positions for two words; the -100 entries presumably mark a continuation
        # sub-word token and the "</s>" token (to confirm against the tokenizer)
        single_labels = encoding.labels.squeeze().tolist()
        self.assertListEqual(single_labels, [1, -100, 2, -100])

        # ---- batch of two examples (padding=True) ----
        batch_words = [["hello", "world"], ["my", "name", "is", "niels"]]
        batch_boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
        batch_word_labels = [[1, 2], [6, 3, 10, 2]]
        batch_encoding = processor(
            document_images,
            batch_words,
            boxes=batch_boxes,
            word_labels=batch_word_labels,
            padding=True,
            return_tensors="pt",
        )

        # keys: same exact key set as in the single-example call
        self.assertListEqual(sorted(batch_encoding.keys()), keys_with_labels)

        # input_ids: the second example is expected to decode without any "<pad>" tokens
        second_decoded = processor.decode(batch_encoding.input_ids[1].tolist())
        self.assertSequenceEqual(second_decoded, "my name is niels</s>")

        # bbox: rows of the second example, ending with a row of 1000s
        second_example_boxes = batch_encoding.bbox[1].tolist()
        self.assertListEqual(
            second_example_boxes,
            [
                [3, 2, 5, 1],
                [6, 7, 4, 2],
                [3, 9, 2, 4],
                [1, 1, 2, 3],
                [1, 1, 2, 3],
                [1, 1, 2, 3],
                [1000, 1000, 1000, 1000],
            ],
        )

        # labels: the four word labels come first, the three remaining positions are -100
        second_example_labels = batch_encoding.labels[1].tolist()
        self.assertListEqual(second_example_labels, [6, 3, 10, 2, -100, -100, -100])
@slow
def test_processor_case_4(self):
    # case 4: visual question answering (inference), apply_ocr=True
    # only a question is passed with each image; no words or boxes are supplied by the test
    default_image_processor = LayoutLMv3ImageProcessor()
    all_tokenizers, document_images = self.get_tokenizers, self.get_images
    inference_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]

    for current_tokenizer in all_tokenizers:
        processor = UdopProcessor(image_processor=default_image_processor, tokenizer=current_tokenizer)

        # ---- single example ----
        encoding = processor(document_images[0], "What's his name?", return_tensors="pt")

        # keys: sorted key list has to match exactly
        self.assertListEqual(sorted(encoding.keys()), inference_keys)

        # input_ids: question, "</s>", then the recognised document text
        # this was obtained with Tesseract 4.1.1
        # fmt: off
        reference_text = "What's his name?</s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>"  # noqa: E231
        # fmt: on
        decoded_text = processor.decode(encoding.input_ids.squeeze().tolist())
        self.assertSequenceEqual(decoded_text, reference_text)

        # ---- batch of two examples, padded / truncated to 20 tokens ----
        batch_encoding = processor(
            document_images,
            ["How old is he?", "what's the time"],
            padding="max_length",
            max_length=20,
            truncation=True,
            return_tensors="pt",
        )

        # keys: same exact key set as in the single-example call
        self.assertListEqual(sorted(batch_encoding.keys()), inference_keys)

        # input_ids: the document text of the second example is cut off after "I"
        # this was obtained with Tesseract 4.1.1
        second_decoded = processor.decode(batch_encoding.input_ids[1].tolist())
        self.assertSequenceEqual(second_decoded, "what's the time</s> 7 ITC Limited REPORT AND ACCOUNTS 2013 I</s>")

        # bbox: twenty rows for the second example, one per position of the max_length=20 sequence
        # fmt: off
        reference_boxes = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [372, 59, 407, 66], [74, 136, 161, 158], [1000, 1000, 1000, 1000]]  # noqa: E231
        # fmt: on
        second_example_boxes = batch_encoding.bbox[1].tolist()
        self.assertListEqual(second_example_boxes, reference_boxes)
@slow
def test_processor_case_5(self):
    # case 5: visual question answering (inference), apply_ocr=False
    # the question is the first text input, the document words go in as `text_pair` with their boxes
    ocr_free_image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
    all_tokenizers, document_images = self.get_tokenizers, self.get_images
    inference_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]

    for current_tokenizer in all_tokenizers:
        processor = UdopProcessor(image_processor=ocr_free_image_processor, tokenizer=current_tokenizer)

        # ---- single example ----
        encoding = processor(
            document_images[0],
            "What's his name?",
            text_pair=["hello", "world"],
            boxes=[[1, 2, 3, 4], [5, 6, 7, 8]],
            return_tensors="pt",
        )

        # keys: sorted key list has to match exactly
        self.assertListEqual(sorted(encoding.keys()), inference_keys)

        # input_ids: question and words are both followed by "</s>"
        decoded_text = processor.decode(encoding.input_ids.squeeze().tolist())
        self.assertSequenceEqual(decoded_text, "What's his name?</s> hello world</s>")

        # ---- batch of two examples (padding=True) ----
        batch_questions = ["How old is he?", "what's the time"]
        batch_words = [["hello", "world"], ["my", "name", "is", "niels"]]
        batch_boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
        batch_encoding = processor(
            document_images,
            batch_questions,
            text_pair=batch_words,
            boxes=batch_boxes,
            padding=True,
            return_tensors="pt",
        )

        # keys: same exact key set as in the single-example call
        self.assertListEqual(sorted(batch_encoding.keys()), inference_keys)

        # input_ids: the first example is expected to carry three "<pad>" tokens, the second none
        first_decoded = processor.decode(batch_encoding.input_ids[0].tolist())
        self.assertSequenceEqual(first_decoded, "How old is he?</s> hello world</s><pad><pad><pad>")
        second_decoded = processor.decode(batch_encoding.input_ids[1].tolist())
        self.assertSequenceEqual(second_decoded, "what's the time</s> my name is niels</s>")

        # bbox: only the last five rows of the second example are compared
        trailing_boxes = batch_encoding.bbox[1].tolist()[-5:]
        self.assertListEqual(
            trailing_boxes,
            [[3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [1, 1, 2, 3], [1000, 1000, 1000, 1000]],
        )

File diff suppressed because it is too large Load Diff