This commit is contained in:
2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions

View File

@@ -0,0 +1,224 @@
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
if is_torchvision_available():
from transformers import PerceptionLMImageProcessorFast
class PerceptionLMImageProcessingTester:
    """Fixture helper: holds PerceptionLM image-processor settings and builds test image inputs."""

    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        image_size=18,
        min_resolution=30,
        max_resolution=400,
        do_resize=True,
        tile_size=16,
        do_normalize=True,
        image_mean=IMAGENET_STANDARD_MEAN,
        image_std=IMAGENET_STANDARD_STD,
        do_convert_rgb=True,
        max_num_tiles=4,
        vision_input_type="thumb+tile",
        resample=Image.Resampling.BICUBIC,  # dummy value
        size=None,  # dummy value; defaults to {"shortest_edge": 20}
    ):
        super().__init__()
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.tile_size = tile_size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_convert_rgb = do_convert_rgb
        self.max_num_tiles = max_num_tiles
        self.vision_input_type = vision_input_type
        self.resample = resample
        # Avoid a shared mutable default argument: build the default dict per instance.
        self.size = size if size is not None else {"shortest_edge": 20}

    def prepare_image_processor_dict(self):
        """Return the kwargs used to construct a PerceptionLMImageProcessorFast."""
        return {
            "do_resize": self.do_resize,
            "tile_size": self.tile_size,
            "do_normalize": self.do_normalize,
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "do_convert_rgb": self.do_convert_rgb,
            "max_num_tiles": self.max_num_tiles,
            "vision_input_type": self.vision_input_type,
            "resample": self.resample,
            "size": self.size,
        }

    def expected_output_image_shape(self, images):
        """Per-image output shape: one thumbnail plus up to ``max_num_tiles`` tiles.

        Bug fix: this previously returned ``self.crop_size[...]``, but ``crop_size``
        is never set on this tester, so any call raised ``AttributeError``. The
        shape below matches the (tiles, channels, tile_size, tile_size) layout the
        overridden ``test_call_*`` tests assert, e.g. ``(5, 3, 16, 16)``.
        """
        return self.max_num_tiles + 1, self.num_channels, self.tile_size, self.tile_size

    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs
    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        """Create random PIL/numpy/torch images according to the tester settings."""
        return prepare_image_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )
@require_torch
@require_vision
class PerceptionLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    """Test suite for the PerceptionLM fast image processor."""

    fast_image_processing_class = PerceptionLMImageProcessorFast if is_torchvision_available() else None
    # Only a fast (torchvision-backed) processor exists for PerceptionLM.
    test_slow_image_processor = False

    def setUp(self):
        super().setUp()
        self.image_processor_tester = PerceptionLMImageProcessingTester(self)

    @property
    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
    def image_processor_dict(self):
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
        """Constructed processors expose all PerceptionLM-specific attributes."""
        for image_processing_class in self.image_processor_list:
            image_processing = image_processing_class(**self.image_processor_dict)
            self.assertTrue(hasattr(image_processing, "do_resize"))
            self.assertTrue(hasattr(image_processing, "tile_size"))
            self.assertTrue(hasattr(image_processing, "do_normalize"))
            self.assertTrue(hasattr(image_processing, "image_mean"))
            self.assertTrue(hasattr(image_processing, "image_std"))
            self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
            self.assertTrue(hasattr(image_processing, "max_num_tiles"))
            self.assertTrue(hasattr(image_processing, "vision_input_type"))

    def test_image_processor_from_dict_with_kwargs(self):
        """`from_dict` honors the dict values and kwargs override them."""
        for image_processing_class in self.image_processor_list:
            image_processor = image_processing_class.from_dict(self.image_processor_dict)
            self.assertEqual(image_processor.tile_size, 16)
            self.assertEqual(image_processor.max_num_tiles, 4)
            self.assertEqual(image_processor.vision_input_type, "thumb+tile")
            # kwargs passed alongside the dict take precedence.
            image_processor = image_processing_class.from_dict(
                self.image_processor_dict, tile_size=42, max_num_tiles=9
            )
            self.assertEqual(image_processor.tile_size, 42)
            self.assertEqual(image_processor.max_num_tiles, 9)
            self.assertEqual(image_processor.vision_input_type, "thumb+tile")

    def test_call_pil(self):
        for image_processing_class in self.image_processor_list:
            # Initialize image_processing
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random PIL images
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
            for image in image_inputs:
                self.assertIsInstance(image, Image.Image)
            # Test not batched input
            # Shape is (batch, tiles, channels, tile_size, tile_size);
            # 5 tiles = thumbnail + max_num_tiles(4), 16 = tile_size.
            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
            expected_output_image_shape = (1, 5, 3, 16, 16)
            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
            # Test batched
            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
            expected_output_image_shape = (7, 5, 3, 16, 16)
            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

    def test_call_numpy(self):
        for image_processing_class in self.image_processor_list:
            # Initialize image_processing
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random numpy tensors
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
            for image in image_inputs:
                self.assertIsInstance(image, np.ndarray)
            # Test not batched input
            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
            expected_output_image_shape = (1, 5, 3, 16, 16)
            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
            # Test batched
            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
            expected_output_image_shape = (7, 5, 3, 16, 16)
            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

    def test_call_pytorch(self):
        for image_processing_class in self.image_processor_list:
            # Initialize image_processing
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random PyTorch tensors
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
            for image in image_inputs:
                self.assertIsInstance(image, torch.Tensor)
            # Test not batched input
            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
            expected_output_image_shape = (1, 5, 3, 16, 16)
            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
            # Test batched
            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
            expected_output_image_shape = (7, 5, 3, 16, 16)
            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

    @unittest.skip(reason="PerceptionLMImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")
    def test_call_numpy_4_channels(self):
        pass

    def test_nested_input(self):
        """Flat lists and nested (per-batch) lists of images produce identical pixel values."""
        for image_processing_class in self.image_processor_list:
            image_processing = image_processing_class(**self.image_processor_dict)
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
            # Test batched as a list of images
            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
            expected_output_image_shape = (7, 5, 3, 16, 16)
            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
            # Test batched as a nested list of images, where each sublist is one batch
            image_inputs_nested = [image_inputs[:3], image_inputs[3:]]
            encoded_images_nested = image_processing(image_inputs_nested, return_tensors="pt").pixel_values
            expected_output_image_shape = (7, 5, 3, 16, 16)
            self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape)
            # Image processor should return same pixel values, independently of input format
            self.assertTrue((encoded_images_nested == encoded_images).all())

View File

@@ -0,0 +1,485 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch PerceptionLM model."""
import unittest
from huggingface_hub import hf_hub_download
from transformers import (
AutoProcessor,
PerceptionLMConfig,
PerceptionLMForConditionalGeneration,
PerceptionLMModel,
is_torch_available,
)
from transformers.testing_utils import (
cleanup,
require_bitsandbytes,
require_read_token,
require_torch,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
if is_torch_available():
import torch
class PerceptionLMVisionText2TextModelTester:
    """Builds a tiny PerceptionLM config plus dummy text/image/video inputs for model tests."""

    def __init__(
        self,
        parent,
        image_token_id=0,
        video_token_id=2,
        seq_length=7,
        tie_word_embeddings=True,
        projector_pooling_ratio=1,  # NOTE(review): accepted but never stored/used — kept for interface stability
        text_config=None,
        is_training=True,
        vision_config=None,
    ):
        # Avoid mutable default arguments (a dict default would be shared across
        # instances and could leak mutations between tests): build defaults here.
        if text_config is None:
            text_config = {
                "model_type": "llama",
                "seq_length": 7,
                "is_training": True,
                "use_input_mask": True,
                "use_token_type_ids": False,
                "use_labels": True,
                "vocab_size": 99,
                "hidden_size": 32,
                "num_hidden_layers": 2,
                "num_attention_heads": 4,
                "intermediate_size": 37,
                "hidden_act": "gelu",
                "hidden_dropout_prob": 0.1,
                "attention_probs_dropout_prob": 0.1,
                "max_position_embeddings": 512,
                "type_vocab_size": 16,
                "type_sequence_label_size": 2,
                "initializer_range": 0.02,
                "num_labels": 3,
                "num_choices": 4,
                "pad_token_id": 1,
            }
        if vision_config is None:
            vision_config = {
                "architecture": "vit_pe_core_large_patch14_336",
                "model_args": {
                    "embed_dim": 64,
                    "img_size": (14, 14),
                    "depth": 2,
                    "global_pool": "",
                    "use_post_transformer_norm": False,
                    "init_values": 0.1,
                    "ref_feat_shape": (1, 1),
                },
            }
        self.parent = parent
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.text_config = text_config
        self.vision_config = vision_config
        self.pad_token_id = text_config["pad_token_id"]
        self.num_hidden_layers = text_config["num_hidden_layers"]
        self.vocab_size = text_config["vocab_size"]
        self.hidden_size = text_config["hidden_size"]
        self.num_attention_heads = text_config["num_attention_heads"]
        self.is_training = is_training
        self.tie_word_embeddings = tie_word_embeddings
        self.batch_size = 3
        self.num_tiles = 1
        self.num_frames = 1
        self.num_channels = 3
        # img_size is square here; 14 is the ViT patch size, so tokens = (side // 14) ** 2.
        self.image_size = self.vision_config["model_args"]["img_size"][0]
        self.num_image_tokens = (self.vision_config["model_args"]["img_size"][0] // 14) ** 2
        self.num_video_tokens = (self.vision_config["model_args"]["img_size"][0] // 14) ** 2
        self.seq_length = seq_length + self.num_image_tokens
        self.encoder_seq_length = self.seq_length

    def get_config(self):
        """Assemble a PerceptionLMConfig from the text/vision sub-configs."""
        return PerceptionLMConfig(
            text_config=self.text_config,
            vision_config=self.vision_config,
            vision_use_cls_token=True,
            image_token_id=self.image_token_id,
            video_token_id=self.video_token_id,
            tie_word_embeddings=self.tie_word_embeddings,
        )

    def prepare_config_and_inputs(self):
        """Return (config, pixel_values, pixel_values_videos) with random pixel data."""
        pixel_values = floats_tensor(
            [
                self.batch_size,
                self.num_tiles,
                self.num_channels,
                self.vision_config["model_args"]["img_size"][0],
                self.vision_config["model_args"]["img_size"][1],
            ]
        )
        pixel_values_videos = floats_tensor(
            [
                self.batch_size,
                self.num_frames,
                self.num_channels,
                self.vision_config["model_args"]["img_size"][0],
                self.vision_config["model_args"]["img_size"][1],
            ]
        )
        config = self.get_config()
        return config, pixel_values, pixel_values_videos

    def prepare_config_and_inputs_for_common(self):
        """Return (config, inputs_dict) with input_ids whose leading positions hold image/video tokens."""
        config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs()
        # Shift by 2 so random ids never collide with the special token ids (0 and 2 by default).
        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
        # Scrub any accidental special tokens, then place them deterministically at the front.
        input_ids[input_ids == config.image_token_id] = self.pad_token_id
        input_ids[input_ids == config.video_token_id] = self.pad_token_id
        input_ids[:, : self.num_image_tokens] = config.image_token_id
        input_ids[:, self.num_image_tokens : self.num_video_tokens + self.num_image_tokens] = config.video_token_id
        inputs_dict = {
            "pixel_values": pixel_values,
            "pixel_values_videos": pixel_values_videos,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        return config, inputs_dict
@require_torch
class PerceptionLMForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    """
    Model tester for `PerceptionLMForConditionalGeneration`.
    """

    all_model_classes = (
        (
            PerceptionLMModel,
            PerceptionLMForConditionalGeneration,
        )
        if is_torch_available()
        else ()
    )
    test_pruning = False
    test_head_masking = False
    # Model is composed of separate vision + text sub-models.
    _is_composite = True

    def setUp(self):
        self.model_tester = PerceptionLMVisionText2TextModelTester(self)
        common_properties = [
            "image_token_id",
            "video_token_id",
        ]
        self.config_tester = ConfigTester(
            self,
            config_class=PerceptionLMConfig,
            has_text_modality=False,
            common_properties=common_properties,
        )

    def test_config(self):
        self.config_tester.run_common_tests()

    # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
    def test_inputs_embeds(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            inputs = self._prepare_for_class(inputs_dict, model_class)
            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]
            del inputs["pixel_values_videos"]
            wte = model.get_input_embeddings()
            inputs["inputs_embeds"] = wte(input_ids)
            with torch.no_grad():
                model(**inputs)

    # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
    # while some other models require pixel_values to be present
    def test_inputs_embeds_matches_input_ids(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            inputs = self._prepare_for_class(inputs_dict, model_class)
            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]
            del inputs["pixel_values_videos"]
            inputs_embeds = model.get_input_embeddings()(input_ids)
            with torch.no_grad():
                out_ids = model(input_ids=input_ids, **inputs)[0]
                out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
            torch.testing.assert_close(out_embeds, out_ids)

    def test_mismatching_num_image_tokens(self):
        """
        Tests that VLMs throw an error with an explicit message saying what is wrong
        when the number of images doesn't match the number of image tokens in the text.
        Also we need to test multi-image cases when one prompt has multiple image tokens.
        """
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            # Only the generation head validates image-token counts.
            if model_class == PerceptionLMModel:
                continue
            model = model_class(config).to(torch_device)
            model.eval()
            _ = model(**input_dict)  # successful forward with no modifications
            # remove one image but leave the image token in text
            input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
            with self.assertRaises(ValueError):
                _ = model(**input_dict)
            # simulate multi-image case by concatenating inputs where each has exactly one image/image-token
            input_ids = input_dict["input_ids"][:1]
            pixel_values = input_dict["pixel_values"][:1]
            input_ids = torch.cat([input_ids, input_ids], dim=0)
            # one image and two image tokens raise an error
            with self.assertRaises(ValueError):
                _ = model(input_ids=input_ids, pixel_values=pixel_values)
            # two images and two image tokens don't raise an error
            pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
            _ = model(input_ids=input_ids, pixel_values=pixel_values)

    # Training tests run only on the generation head (the bare PerceptionLMModel has no loss).
    def test_training(self):
        self.all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else ()
        super().test_training()

    def test_training_gradient_checkpointing(self):
        self.all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else ()
        super().test_training_gradient_checkpointing()

    def test_training_gradient_checkpointing_use_reentrant(self):
        self.all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else ()
        super().test_training_gradient_checkpointing_use_reentrant()

    def test_training_gradient_checkpointing_use_reentrant_false(self):
        self.all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else ()
        super().test_training_gradient_checkpointing_use_reentrant_false()

    @unittest.skip(reason="Timm Eva (PE) weights cannot be fully constructed in _init_weights")
    def test_can_init_all_missing_weights(self):
        pass

    @unittest.skip(reason="Timm Eva (PE) weights cannot be fully constructed in _init_weights")
    def test_initialization(self):
        pass

    @unittest.skip(
        reason="PE/TIMM's attention implementation is self configured and won't raise ValueError on global attention implementation."
    )
    def test_flash_attn_2_can_dispatch_composite_models(self):
        pass

    @unittest.skip(
        "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test"
    )
    def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
        pass

    @unittest.skip("ViT PE / TimmWrapperModel cannot be tested with meta device")
    def test_can_be_initialized_on_meta(self):
        pass

    @unittest.skip("Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM")
    def test_generate_from_inputs_embeds_0_greedy(self):
        pass

    @unittest.skip("Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM")
    def test_generate_from_inputs_embeds_1_beam_search(self):
        pass

    @unittest.skip("Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM")
    def test_generate_from_inputs_embeds_with_static_cache(self):
        pass

    ## Skip flash attention related tests below
    ## correct configuration:
    ## from_pretrained(model_id, attn_implementation={"text_config": "flash_attention_2", "vision_config": "eager"}
    @unittest.skip("Flash attn test is not configured correctly as we need to configure vision/timm model to 'eager'.")
    def test_eager_matches_fa2_generate(self):
        pass

    @unittest.skip("Flash attn test is not configured correctly as we need to configure vision/timm model to 'eager'.")
    def test_flash_attn_2_fp32_ln(self):
        pass

    @unittest.skip("Flash attn test is not configured correctly as we need to configure vision/timm model to 'eager'.")
    def test_flash_attn_2_from_config(self):
        pass

    @unittest.skip("SDPA test is not configured correctly as we need to configure vision/timm model to 'eager'.")
    def test_eager_matches_sdpa_generate_with_dynamic_cache(self):
        pass

    @unittest.skip("Flash attn test is not configured correctly as we need to configure vision/timm model to 'eager'.")
    def test_flash_attn_2_inference_equivalence_right_padding(self):
        pass

    @unittest.skip("SDPA test is not configured correctly as we need to configure vision/timm model to 'eager'.")
    def test_eager_matches_sdpa_generate(self):
        pass

    @unittest.skip("Flash attn test is not configured correctly as we need to configure vision/timm model to 'eager'.")
    def test_flash_attn_2_inference_equivalence(self):
        pass

    @unittest.skip(
        "PerceptionLMForConditionalGeneration does not have language_model, vision_tower, multi_modal_projector."
    )
    def test_sdpa_can_dispatch_composite_models(self):
        pass

    @unittest.skip("Cannot set `output_attentions` for timm models.")
    def test_attention_outputs(self):
        pass

    @unittest.skip("Cannot set `output_attentions` for timm models.")
    def test_retain_grad_hidden_states_attentions(self):
        pass

    @unittest.skip("Cannot set `output_attentions` for timm models.")
    def test_generate_compilation_all_outputs(self):
        pass
# Hub checkpoint used by the slow integration tests below.
TEST_MODEL_PATH = "facebook/Perception-LM-1B"
@require_torch
@require_bitsandbytes
@slow
@require_read_token
class PerceptionLMForConditionalGenerationIntegrationTest(unittest.TestCase):
    """Slow integration tests running the real 1B checkpoint (4-bit) on Hub-hosted fixtures."""

    def setUp(self):
        self.processor = AutoProcessor.from_pretrained(TEST_MODEL_PATH)
        # Small image/video test assets hosted on the Hub.
        self.image_file = hf_hub_download(
            repo_id="shumingh/perception_lm_test_images",
            filename="14496_0.PNG",
            repo_type="dataset",
        )
        self.video_file = hf_hub_download(
            repo_id="shumingh/perception_lm_test_videos",
            filename="GUWR5TyiY-M_000012_000022.mp4",
            repo_type="dataset",
        )
        # Single-image conversation.
        self.conversation1 = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": self.image_file},
                    {"type": "text", "text": "Describe the bar plot in the image."},
                ],
            }
        ]
        # Single-video conversation.
        self.conversation2 = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "url": self.video_file,
                    },
                    {"type": "text", "text": "Can you describe the video in detail?"},
                ],
            }
        ]

    def tearDown(self):
        # Free GPU memory between tests.
        cleanup(torch_device, gc_collect=True)

    def test_small_model_integration_test(self):
        """Single-image prompt produces the pinned greedy continuation."""
        model = PerceptionLMForConditionalGeneration.from_pretrained(
            TEST_MODEL_PATH, load_in_4bit=True, cache_dir="./"
        )
        inputs = self.processor.apply_chat_template(
            [self.conversation1],
            num_frames=32,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            padding=True,
            padding_side="left",
        ).to(torch_device)
        generate_ids = model.generate(**inputs, max_new_tokens=18)
        # Strip the prompt so only newly generated tokens are decoded.
        input_length = inputs["input_ids"].shape[1]
        generate_ids_without_inputs = generate_ids[:, input_length:]
        EXPECTED_DECODED_TEXT = "The bar plot displays the values of four categories: step, horror, mood, and lumber"  # fmt: skip
        self.assertEqual(
            self.processor.decode(generate_ids_without_inputs[0], skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    def test_small_model_integration_test_batched(self):
        """Batched image + video prompts produce the pinned continuations."""
        model = PerceptionLMForConditionalGeneration.from_pretrained(TEST_MODEL_PATH, load_in_4bit=True)
        processor = AutoProcessor.from_pretrained(TEST_MODEL_PATH)
        inputs = processor.apply_chat_template(
            [self.conversation1, self.conversation2],
            num_frames=32,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            padding=True,
            padding_side="left",
        ).to(torch_device)
        generate_ids = model.generate(**inputs, max_new_tokens=18)
        input_length = inputs["input_ids"].shape[1]
        generate_ids_without_inputs = generate_ids[:, input_length:]
        EXPECTED_DECODED_TEXT = ['The bar plot displays the values of four categories: step, horror, mood, and lumber', 'The video shows a group of people in green shirts and white shorts performing a jump rope routine']  # fmt: skip
        self.assertEqual(
            processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    def test_generation_no_images(self):
        """Text-only generation must work even though the model is multimodal."""
        model = PerceptionLMForConditionalGeneration.from_pretrained(TEST_MODEL_PATH, load_in_4bit=True)
        processor = AutoProcessor.from_pretrained(TEST_MODEL_PATH)
        # Prepare inputs with no images
        inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device)
        # Make sure that `generate` works
        _ = model.generate(**inputs, max_new_tokens=20)

View File

@@ -0,0 +1,177 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import shutil
import tempfile
import unittest
from transformers import (
AutoProcessor,
AutoTokenizer,
PerceptionLMProcessor,
)
from transformers.testing_utils import require_read_token, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from transformers import PerceptionLMImageProcessorFast, PerceptionLMVideoProcessor
if is_torch_available():
import torch
# Hub checkpoint whose tokenizer is used to build the test processor.
TEST_MODEL_PATH = "facebook/Perception-LM-1B"
@require_vision
@require_read_token
# NOTE(review): skip reason contains typos ("Fequires", "didn't requests") — left
# untouched here because it is a runtime string.
@unittest.skip("Fequires read token and we didn't requests access yet. FIXME @ydshieh when you are back :)")
class PerceptionLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Tests for PerceptionLMProcessor: chat-template round-trips and image-token expansion."""

    processor_class = PerceptionLMProcessor

    @classmethod
    def setUpClass(cls):
        # Build a full processor (image + video + tokenizer) and save it to a temp dir
        # so individual tests can reload it via from_pretrained.
        cls.tmpdirname = tempfile.mkdtemp()
        image_processor = PerceptionLMImageProcessorFast(
            tile_size=448, max_num_tiles=4, vision_input_type="thumb+tile"
        )
        video_processor = PerceptionLMVideoProcessor()
        tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL_PATH)
        tokenizer.add_special_tokens({"additional_special_tokens": ["<|image|>", "<|video|>"]})
        processor_kwargs = cls.prepare_processor_dict()
        processor = PerceptionLMProcessor(
            image_processor=image_processor, video_processor=video_processor, tokenizer=tokenizer, **processor_kwargs
        )
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token_id = processor.image_token_id
        cls.video_token_id = processor.video_token_id

    def get_tokenizer(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer

    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdirname, ignore_errors=True)

    @staticmethod
    def prepare_processor_dict():
        # CHAT_TEMPLATE is the module-level constant defined below this class.
        return {
            "chat_template": CHAT_TEMPLATE,
            "patch_size": 14,
            "pooling_ratio": 2,
        }  # fmt: skip

    def test_chat_template_is_saved(self):
        processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
        processor_dict_loaded = json.loads(processor_loaded.to_json_string())
        # chat templates aren't serialized to json in processors
        self.assertFalse("chat_template" in processor_dict_loaded)
        # they have to be saved as separate file and loaded back from that file
        # so we check if the same template is loaded
        processor_dict = self.prepare_processor_dict()
        self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None))

    def test_image_token_filling(self):
        """thumb+tile mode: 5 tiles * (448/14/2)^2 = 5 * 256 image tokens are inserted."""
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        # Important to check with non square image
        image = torch.randn((1, 3, 450, 500))
        # 5 tiles (thumbnail tile + 4 tiles)
        # 448/patch_size/pooling_ratio = 16 => 16*16 tokens per tile
        expected_image_tokens = 16 * 16 * 5
        image_token_index = processor.image_token_id
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            },
        ]
        inputs = processor(
            text=[processor.apply_chat_template(messages)],
            images=[image],
            return_tensors="pt",
        )
        image_tokens = (inputs["input_ids"] == image_token_index).sum().item()
        self.assertEqual(expected_image_tokens, image_tokens)
        self.assertEqual(inputs["pixel_values"].ndim, 5)

    def test_vanilla_image_with_no_tiles_token_filling(self):
        """vanilla mode: a single tile yields 16*16 image tokens."""
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        processor.image_processor.vision_input_type = "vanilla"
        # Important to check with non square image
        image = torch.randn((1, 3, 450, 500))
        # 1 tile
        # 448/patch_size/pooling_ratio = 16 => 16*16 tokens per tile
        expected_image_tokens = 16 * 16 * 1
        image_token_index = processor.image_token_id
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            },
        ]
        inputs = processor(
            text=[processor.apply_chat_template(messages)],
            images=[image],
            return_tensors="pt",
        )
        image_tokens = (inputs["input_ids"] == image_token_index).sum().item()
        self.assertEqual(expected_image_tokens, image_tokens)
        self.assertEqual(inputs["pixel_values"].ndim, 5)
        self.assertEqual(inputs["pixel_values"].shape[1], 1)  # 1 tile
CHAT_TEMPLATE = (
"{{- bos_token }}"
"{%- if messages[0]['role'] == 'system' -%}"
" {%- set system_message = messages[0]['content']|trim %}\n"
" {%- set messages = messages[1:] %}\n"
"{%- else %}"
" {%- set system_message = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.' %}"
"{%- endif %}"
"{{- '<|start_header_id|>system<|end_header_id|>\\n\\n' }}"
"{{- system_message }}"
"{{- '<|eot_id|>' }}"
"{%- for message in messages %}"
"{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' }}"
"{%- for content in message['content'] | selectattr('type', 'equalto', 'image') %}"
"{{ '<|image|>' }}"
"{%- endfor %}"
"{%- for content in message['content'] | selectattr('type', 'equalto', 'video') %}"
"{{ '<|video|>' }}"
"{%- endfor %}"
"{%- for content in message['content'] | selectattr('type', 'equalto', 'text') %}"
"{{- content['text'] | trim }}"
"{%- endfor %}"
"{{'<|eot_id|>' }}"
"{%- endfor %}"
"{%- if add_generation_prompt %}"
"{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}"
"{%- endif %}"
)

View File

@@ -0,0 +1,124 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torchvision_available, is_vision_available
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
if is_vision_available():
if is_torchvision_available():
from transformers import PerceptionLMVideoProcessor
class PerceptionLMVideoProcessingTester:
    """Fixture helper: holds video-processor settings and builds random video inputs."""

    def __init__(
        self,
        parent,
        batch_size=5,
        num_frames=8,
        num_channels=3,
        min_resolution=30,
        max_resolution=80,
        do_resize=True,
        size=None,
        do_center_crop=True,
        crop_size=None,
        do_normalize=True,
        image_mean=IMAGENET_STANDARD_MEAN,
        image_std=IMAGENET_STANDARD_STD,
        do_convert_rgb=True,
    ):
        # Fill in per-instance defaults for the dict-valued arguments.
        if size is None:
            size = {"height": 20, "width": 20}
        if crop_size is None:
            crop_size = {"height": 18, "width": 18}
        self.parent = parent
        self.batch_size = batch_size
        self.num_frames = num_frames
        self.num_channels = num_channels
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_convert_rgb = do_convert_rgb

    def prepare_video_processor_dict(self):
        """Return the kwargs for constructing a PerceptionLMVideoProcessor."""
        return dict(
            do_resize=self.do_resize,
            size=self.size,
            do_center_crop=self.do_center_crop,
            crop_size=self.crop_size,
            do_normalize=self.do_normalize,
            image_mean=self.image_mean,
            image_std=self.image_std,
            do_convert_rgb=self.do_convert_rgb,
        )

    def expected_output_video_shape(self, images):
        """Per-video output shape after center-cropping: (frames, channels, H, W)."""
        height = self.crop_size["height"]
        width = self.crop_size["width"]
        return self.num_frames, self.num_channels, height, width

    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
        """Create random videos according to the tester settings."""
        return prepare_video_inputs(
            batch_size=self.batch_size,
            num_frames=self.num_frames,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            return_tensors=return_tensors,
        )
@require_torch
@require_vision
class PerceptionLMVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
    """Test suite for the PerceptionLM video processor."""

    fast_video_processing_class = PerceptionLMVideoProcessor if is_torchvision_available() else None

    def setUp(self):
        super().setUp()
        self.video_processor_tester = PerceptionLMVideoProcessingTester(self)

    @property
    def video_processor_dict(self):
        return self.video_processor_tester.prepare_video_processor_dict()

    def test_video_processor_properties(self):
        """A constructed processor exposes every expected configuration attribute."""
        processor = self.fast_video_processing_class(**self.video_processor_dict)
        for attribute in (
            "do_resize",
            "size",
            "do_center_crop",
            "center_crop",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_convert_rgb",
        ):
            self.assertTrue(hasattr(processor, attribute))

    def test_video_processor_from_dict_with_kwargs(self):
        """`from_dict` honors dict values; int kwargs are expanded to square sizes."""
        processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
        self.assertEqual(processor.size, {"height": 20, "width": 20})
        self.assertEqual(processor.crop_size, {"height": 18, "width": 18})
        processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42, crop_size=84)
        self.assertEqual(processor.size, {"height": 42, "width": 42})
        self.assertEqual(processor.crop_size, {"height": 84, "width": 84})