commit e27e3f16bb (parent c8feb4deb5), 2025-10-09 16:47:16 +08:00
5248 changed files with 1778505 additions and 0 deletions


@@ -0,0 +1,408 @@
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import tempfile
import unittest
import numpy as np
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, load_image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
from ...test_processing_common import url_to_local_path
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import Qwen2VLImageProcessor
if is_torchvision_available():
from transformers import Qwen2VLImageProcessorFast
class Qwen2VLImageProcessingTester:
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
num_frames=10,
min_resolution=56,
max_resolution=1024,
min_pixels=56 * 56,
max_pixels=28 * 28 * 1280,
do_normalize=True,
image_mean=OPENAI_CLIP_MEAN,
image_std=OPENAI_CLIP_STD,
do_resize=True,
patch_size=14,
temporal_patch_size=2,
merge_size=2,
do_convert_rgb=True,
):
self.parent = parent
self.batch_size = batch_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.num_channels = num_channels
self.num_frames = num_frames
self.image_mean = OPENAI_CLIP_MEAN
self.image_std = OPENAI_CLIP_STD
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.patch_size = patch_size
self.temporal_patch_size = temporal_patch_size
self.merge_size = merge_size
self.do_resize = do_resize
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"min_pixels": self.min_pixels,
"max_pixels": self.max_pixels,
"patch_size": self.patch_size,
"temporal_patch_size": self.temporal_patch_size,
"merge_size": self.merge_size,
}
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
images = prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
return [[image] for image in images]
def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_video_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
num_frames=self.num_frames,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
@require_torch
@require_vision
class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = Qwen2VLImageProcessor if is_vision_available() else None
fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None
def setUp(self):
super().setUp()
self.image_processor_tester = Qwen2VLImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "min_pixels"))
self.assertTrue(hasattr(image_processing, "max_pixels"))
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "patch_size"))
self.assertTrue(hasattr(image_processing, "temporal_patch_size"))
self.assertTrue(hasattr(image_processing, "merge_size"))
def test_image_processor_from_dict_with_kwargs(self):
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.min_pixels, 56 * 56)
self.assertEqual(image_processor.max_pixels, 28 * 28 * 1280)
image_processor = image_processing_class.from_dict(
self.image_processor_dict, min_pixels=256 * 256, max_pixels=640 * 640
)
self.assertEqual(image_processor.min_pixels, 256 * 256)
self.assertEqual(image_processor.max_pixels, 640 * 640)
def test_select_best_resolution(self):
# Test with a final resize resolution
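# smart_resize rounds each dimension to the nearest multiple of `factor` (28 here),
# so a 561x278 input maps to (560, 280)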
best_resolution = smart_resize(561, 278, factor=28)
self.assertEqual(best_resolution, (560, 280))
def test_call_pil(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
for image in image_inputs:
self.assertIsInstance(image[0], Image.Image)
# Test not batched input
process_out = image_processing(image_inputs[0], return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
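# image_grid_thw of [1, 70, 70] means each image is resized to a 70x70 grid of 14x14 patches,
# i.e. 4900 flattened rows, each of num_channels (3) * temporal_patch_size (2) * 14 * 14 = 1176 values;
# the batched case below stacks all 7 images: 7 * 4900 = 34300 rows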
expected_output_image_shape = (4900, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched
process_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
def test_call_numpy(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
for image in image_inputs:
self.assertIsInstance(image[0], np.ndarray)
# Test not batched input
process_out = image_processing(image_inputs[0], return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (4900, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched
process_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
def test_call_pytorch(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
for image in image_inputs:
self.assertIsInstance(image[0], torch.Tensor)
# Test not batched input
process_out = image_processing(image_inputs[0], return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (4900, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched
process_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
@unittest.skip(reason="Qwen2VLImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")
def test_call_numpy_4_channels(self):
pass
def test_nested_input(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
# Test batched as a list of images
process_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched as a nested list of images, where each sublist is one batch
image_inputs_nested = image_inputs[:3] + image_inputs[3:]
process_out = image_processing(image_inputs_nested, return_tensors="pt")
encoded_images_nested = process_out.pixel_values
image_grid_thws_nested = process_out.image_grid_thw
expected_output_image_shape = (34300, 1176)
expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7)
self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Image processor should return the same pixel values regardless of input format
self.assertTrue((encoded_images_nested == encoded_images).all())
self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
def test_video_inputs(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
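# Frame counts are padded up to a multiple of temporal_patch_size (2), so 1-2 frames give grid_t=1,
# 3-4 give grid_t=2 and 5-6 give grid_t=3; each of the 7 videos then contributes grid_t * 70 * 70 patch rows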
expected_dims_by_frames = {1: 34300, 2: 34300, 3: 68600, 4: 68600, 5: 102900, 6: 102900}
for num_frames, expected_dims in expected_dims_by_frames.items():
image_processor_tester = Qwen2VLImageProcessingTester(self, num_frames=num_frames)
video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
process_out = image_processing(None, videos=video_inputs, return_tensors="pt")
encoded_video = process_out.pixel_values_videos
expected_output_video_shape = (expected_dims, 1176)
self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
def test_custom_patch_size(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
for patch_size in (1, 3, 5, 7):
image_processor_tester = Qwen2VLImageProcessingTester(self, patch_size=patch_size)
video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
process_out = image_processing(None, videos=video_inputs, return_tensors="pt")
encoded_video = process_out.pixel_values_videos
expected_output_video_shape = (171500, 1176)
self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
def test_custom_image_size(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
image_processing.save_pretrained(tmpdirname)
image_processor_loaded = image_processing_class.from_pretrained(
tmpdirname, max_pixels=56 * 56, min_pixels=28 * 28
)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
process_out = image_processor_loaded(image_inputs, return_tensors="pt")
expected_output_video_shape = [112, 1176]
self.assertListEqual(list(process_out.pixel_values.shape), expected_output_video_shape)
def test_custom_pixels(self):
pixel_choices = frozenset(itertools.product((100, 150, 200, 20000), (100, 150, 200, 20000)))
for image_processing_class in self.image_processor_list:
image_processor_dict = self.image_processor_dict.copy()
for a_pixels, b_pixels in pixel_choices:
image_processor_dict["min_pixels"] = min(a_pixels, b_pixels)
image_processor_dict["max_pixels"] = max(a_pixels, b_pixels)
image_processor = image_processing_class(**image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs()
# Just checking that it doesn't raise an error
image_processor(image_inputs, return_tensors="pt")
def test_temporal_padding(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# Create random video inputs with a number of frames not divisible by temporal_patch_size
image_processor_tester = Qwen2VLImageProcessingTester(self, num_frames=5, temporal_patch_size=4)
video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
# Process the video inputs
process_out = image_processing(None, videos=video_inputs, return_tensors="pt")
encoded_video = process_out.pixel_values_videos
# Check the shape after padding
expected_output_video_shape = (102900, 1176) # Adjusted based on padding
self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
# Check divisibility by temporal_patch_size
self.assertEqual(encoded_video.shape[0] % 4, 0)
@require_vision
@require_torch
def test_slow_fast_equivalence(self):
dummy_image = load_image(url_to_local_path("http://images.cocodataset.org/val2017/000000039769.jpg"))
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self.assertEqual(encoding_slow.image_grid_thw.dtype, encoding_fast.image_grid_thw.dtype)
self._assert_slow_fast_tensors_equivalence(
encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float()
)
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
self.skipTest(
reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
)
dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
self.assertEqual(encoding_slow.image_grid_thw.dtype, encoding_fast.image_grid_thw.dtype)
self._assert_slow_fast_tensors_equivalence(
encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float()
)
def test_get_num_patches_without_images(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
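# get_number_of_image_patches applies smart_resize (factor = patch_size * merge_size = 28) and returns
# (resized_h / patch_size) * (resized_w / patch_size): 100x100 -> 112x112 -> 8*8 = 64 patches,
# 200x50 -> 196x56 -> 14*4 = 56, and with patch_size=28 the factor becomes 56, so 100x100 -> 112x112 -> 4*4 = 16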
num_patches = image_processing.get_number_of_image_patches(height=100, width=100, images_kwargs={})
self.assertEqual(num_patches, 64)
num_patches = image_processing.get_number_of_image_patches(height=200, width=50, images_kwargs={})
self.assertEqual(num_patches, 56)
num_patches = image_processing.get_number_of_image_patches(
height=100, width=100, images_kwargs={"patch_size": 28}
)
self.assertEqual(num_patches, 16)


@@ -0,0 +1,619 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Qwen2-VL model."""
import copy
import gc
import tempfile
import unittest
import requests
from transformers import (
AutoProcessor,
Qwen2VLConfig,
Qwen2VLForConditionalGeneration,
Qwen2VLModel,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import (
Expectations,
backend_empty_cache,
require_flash_attn,
require_torch,
require_torch_gpu,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
_config_zero_init,
floats_tensor,
ids_tensor,
)
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
class Qwen2VLVisionText2TextModelTester:
def __init__(
self,
parent,
batch_size=3,
seq_length=7,
num_channels=3,
ignore_index=-100,
image_size=14,
bos_token_id=0,
eos_token_id=1,
pad_token_id=2,
vision_start_token_id=3,
image_token_id=4,
video_token_id=5,
hidden_act="silu",
hidden_size=32,
vocab_size=99,
intermediate_size=37,
max_position_embeddings=512,
max_window_layers=3,
model_type="qwen2_vl",
num_attention_heads=4,
num_hidden_layers=2,
num_key_value_heads=2,
rope_theta=10000,
tie_word_embeddings=True,
is_training=True,
vision_config={
"depth": 2,
"embed_dim": 32,
"hidden_act": "quick_gelu",
"hidden_size": 32,
"mlp_ratio": 4,
"num_heads": 4,
"patch_size": 14,
"spatial_merge_size": 1,
"temporal_patch_size": 2,
},
rope_scaling={"type": "mrope", "mrope_section": [2, 1, 1]},
):
self.parent = parent
self.ignore_index = ignore_index
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.vision_start_token_id = vision_start_token_id
self.image_token_id = image_token_id
self.video_token_id = video_token_id
self.hidden_act = hidden_act
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.max_position_embeddings = max_position_embeddings
self.max_window_layers = max_window_layers
self.model_type = model_type
self.num_attention_heads = num_attention_heads
self.num_hidden_layers = num_hidden_layers
self.num_key_value_heads = num_key_value_heads
self.rope_theta = rope_theta
self.tie_word_embeddings = tie_word_embeddings
self.vision_config = vision_config
self.rope_scaling = rope_scaling
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.is_training = is_training
self.vocab_size = vocab_size
self.num_image_tokens = 32
self.seq_length = seq_length + self.num_image_tokens
def get_config(self):
return Qwen2VLConfig(
hidden_size=self.hidden_size,
intermediate_size=self.intermediate_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
num_key_value_heads=self.num_key_value_heads,
hidden_act=self.hidden_act,
max_position_embeddings=self.max_position_embeddings,
vision_config=self.vision_config,
model_type=self.model_type,
max_window_layers=self.max_window_layers,
rope_scaling=self.rope_scaling,
tie_word_embeddings=self.tie_word_embeddings,
bos_token_id=self.bos_token_id,
eos_token_id=self.eos_token_id,
pad_token_id=self.pad_token_id,
vision_start_token_id=self.vision_start_token_id,
image_token_id=self.image_token_id,
video_token_id=self.video_token_id,
vocab_size=self.vocab_size,
)
def prepare_config_and_inputs(self):
config = self.get_config()
patch_size = config.vision_config.patch_size
temporal_patch_size = config.vision_config.temporal_patch_size
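# Flattened vision patches: with image_size == patch_size == 14 each image yields a single patch,
# so the tensor has batch_size (3) rows of num_channels * patch_size**2 * temporal_patch_size = 1176 values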
pixel_values = floats_tensor(
[
self.batch_size * (self.image_size**2) // (patch_size**2),
self.num_channels * (patch_size**2) * temporal_patch_size,
]
)
return config, pixel_values
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
input_ids[:, -1] = self.pad_token_id
attention_mask[:, -1] = 0
input_ids[input_ids == self.video_token_id] = self.pad_token_id
input_ids[input_ids == self.image_token_id] = self.pad_token_id
input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
input_ids[:, self.num_image_tokens] = self.image_token_id
input_ids[:, self.num_image_tokens - 1] = self.vision_start_token_id
inputs_dict = {
"pixel_values": pixel_values,
"image_grid_thw": torch.tensor([[1, 1, 1]] * self.batch_size, device=torch_device),
"input_ids": input_ids,
"attention_mask": attention_mask,
}
return config, inputs_dict
@require_torch
class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
"""
Model tester for `Qwen2VLForConditionalGeneration`.
"""
all_model_classes = (
(
Qwen2VLModel,
Qwen2VLForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = {"image-text-to-text": Qwen2VLForConditionalGeneration}
test_pruning = False
test_head_masking = False
_is_composite = True
def setUp(self):
self.model_tester = Qwen2VLVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=Qwen2VLConfig, has_text_modality=False)
def test_config(self):
self.config_tester.run_common_tests()
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
def test_mismatching_num_image_tokens(self):
"""
Tests that VLMs throw an error with an explicit message saying what is wrong
when the number of images doesn't match the number of image tokens in the text.
Also covers multi-image cases where one prompt has multiple image tokens.
"""
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
model.eval()
curr_input_dict = copy.deepcopy(input_dict)
_ = model(**curr_input_dict) # successful forward with no modifications
# remove one image but leave the image token in text
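# one_img_length is the number of flattened patch rows a single image contributes
# (1 here, since image_size == patch_size == 14)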
patch_size = config.vision_config.patch_size
one_img_length = (self.model_tester.image_size**2) // (patch_size**2)
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-one_img_length:, ...]
curr_input_dict["image_grid_thw"] = curr_input_dict["image_grid_thw"][-1:, ...]
with self.assertRaises(ValueError):
_ = model(**curr_input_dict)
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values"][:one_img_length]
image_grid_thw = curr_input_dict["image_grid_thw"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error
with self.assertRaises(ValueError):
_ = model(input_ids=input_ids, pixel_values=pixel_values, image_grid_thw=image_grid_thw)
# two images and two image tokens don't raise an error
pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
image_grid_thw = torch.cat([image_grid_thw, image_grid_thw], dim=0)
_ = model(input_ids=input_ids, pixel_values=pixel_values, image_grid_thw=image_grid_thw)
def test_forward_with_rope_deltas_cached(self):
"""
Tests that Qwen2-VL computes new rope deltas on every forward pass with a new set of inputs.
Rope deltas are cached during generation and reused for the decoding phase, but are not reset
automatically after generation ends. See https://github.com/huggingface/transformers/pull/36013 for more.
"""
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config).to(torch_device)
# Generate and make sure rope_deltas are not `None`
self.assertTrue(model.model.rope_deltas is None)
generation_output = model.generate(
**input_dict, max_new_tokens=4, return_dict_in_generate=True, output_logits=True
)
self.assertTrue(model.model.rope_deltas is not None)
# Now if we do a plain forward pass, new rope deltas should be computed, because the cache is not passed
forward_output = model(**input_dict)
torch.testing.assert_close(
generation_output.logits[0], forward_output.logits[:, -1, :], rtol=1e-4, atol=1e-4
)
def attention_mask_padding_matches_padding_free_with_position_ids(
self, attn_implementation: str, fa_kwargs: bool = False
):
max_new_tokens = 30
for model_class in self.all_generative_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.float16]:
dummy_input = dummy_input.to(torch.bfloat16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
if 0 in inputs_dict["attention_mask"][:, -1]:
inputs_dict["attention_mask"] = inputs_dict["attention_mask"].flip(1)
dummy_attention_mask = inputs_dict["attention_mask"]
inputs_dict["input_ids"][~dummy_attention_mask.bool()] = config.get_text_config().pad_token_id
model = (
model_class.from_pretrained(
tmpdirname,
dtype=torch.bfloat16,
attn_implementation=attn_implementation,
)
.to(torch_device)
.eval()
)
# flatten to padding-free format: keep only the non-masked tokens, concatenated in a single row
padfree_inputs_dict = {
"pixel_values": inputs_dict["pixel_values"],
"image_grid_thw": inputs_dict["image_grid_thw"],
"input_ids": inputs_dict["input_ids"][dummy_attention_mask.bool()].unsqueeze(0),
}
# add position_ids
vision_position_ids, deltas = model.model.get_rope_index(
input_ids=inputs_dict["input_ids"],
image_grid_thw=inputs_dict["image_grid_thw"],
attention_mask=inputs_dict["attention_mask"],
) # [3, bs, padded-seq-len]
vision_padfree_positions = vision_position_ids[:, dummy_attention_mask.bool()].view(
3, -1
) # [3, bs*padfree-len]
text_padfree_positions = torch.cat(
[torch.arange(length) for length in dummy_attention_mask.sum(1).tolist()]
) # [1, bs*padfree-len]
text_padfree_positions = text_padfree_positions.long().unsqueeze(0).to(torch_device)
padfree_inputs_dict["position_ids"] = torch.cat([text_padfree_positions, vision_padfree_positions])[
:, None, :
]
if fa_kwargs:
cu_seq_lens = [0] + dummy_attention_mask.sum(1).tolist()
cu_seq_lens = torch.tensor(cu_seq_lens, device=torch_device)
max_length = cu_seq_lens.diff().max().item()
padfree_inputs_dict.update(
{
"cu_seq_lens_q": cu_seq_lens.cumsum(-1).to(dtype=torch.int32),
"cu_seq_lens_k": cu_seq_lens.cumsum(-1).to(dtype=torch.int32),
"max_length_q": max_length,
"max_length_k": max_length,
}
)
# We need to do a simple forward pass without cache in order to trigger the packed SDPA/FLEX/EAGER path
res_padded = model(**inputs_dict, use_cache=False)
res_padfree = model(**padfree_inputs_dict, use_cache=False)
logits_padded = res_padded.logits[inputs_dict["attention_mask"].bool()]
logits_padfree = res_padfree.logits[0]
# acceptable numerical instability
tol = torch.finfo(torch.bfloat16).eps
torch.testing.assert_close(logits_padded, logits_padfree, rtol=tol, atol=tol)
@unittest.skip(reason="Feedforward chunking is not yet supported")
def test_feed_forward_chunking(self):
pass
@unittest.skip(reason="CPU offload is not yet supported")
def test_cpu_offload(self):
pass
@unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
def test_disk_offload_bin(self):
pass
@unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
def test_disk_offload_safetensors(self):
pass
@unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
def test_model_parallelism(self):
pass
@unittest.skip(reason="Compile not yet supported because in Qwen2VL models")
def test_sdpa_can_dispatch_on_flash(self):
pass
@unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.")
def test_multi_gpu_data_parallel_forward(self):
pass
@require_torch
class Qwen2VLIntegrationTest(unittest.TestCase):
def setUp(self):
self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
self.messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What kind of dog is this?"},
],
}
]
url = "https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg"
self.image = Image.open(requests.get(url, stream=True).raw)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
@slow
def test_small_model_integration_test(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct", dtype="auto", device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text], images=[self.image], return_tensors="pt")
expected_input_ids = [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 151652, 151655, 151655] # fmt: skip
assert expected_input_ids == inputs.input_ids[0].tolist()[:17]
expected_pixel_slice = torch.tensor(
[
[0.8792, 0.8792, 0.9084],
[1.1858, 1.1858, 1.2296],
[1.2004, 1.2004, 1.2150],
[1.4340, 1.4340, 1.4194],
[1.3902, 1.4048, 1.4194],
[1.5216, 1.5362, 1.5362],
],
dtype=torch.float32,
device="cpu",
)
assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3)
# verify generation
inputs = inputs.to(torch_device)
output = model.generate(**inputs, max_new_tokens=30)
EXPECTED_DECODED_TEXT = "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices"
self.assertEqual(
self.processor.decode(output[0], skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_batch(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct", dtype="auto", device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="pt").to(
torch_device
)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30)
EXPECTED_DECODED_TEXT = [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_expand(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct", dtype="auto", device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text], images=[self.image], return_tensors="pt").to(torch_device)
output = model.generate(**inputs, max_new_tokens=30, num_return_sequences=3)
EXPECTED_DECODED_TEXT = [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_batch_wo_image(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct", dtype="auto", device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
messages2 = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who are you?"},
]
text2 = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text, text2], images=[self.image], padding=True, return_tensors="pt").to(
torch_device
)
# it should not matter that only one of the two prompts has an image
output = model.generate(**inputs, max_new_tokens=30)
EXPECTED_DECODED_TEXT = [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
'system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am a large language model created by Alibaba Cloud. I am called Qwen.'
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_batch_different_resolutions(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct", dtype="auto", device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
text2 = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
image2 = self.image.resize((224, 224))
inputs = self.processor(text=[text, text2], images=[self.image, image2], padding=True, return_tensors="pt").to(
torch_device
)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30)
DECODED_TEXT = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_DECODED_TEXTS = Expectations(
{
("xpu", 3): [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
],
("cuda", None): [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets',
],
("cuda", 8): [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices'
],
}
) # fmt: skip
EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation()
self.assertEqual(DECODED_TEXT, EXPECTED_DECODED_TEXT)
@slow
@require_flash_attn
@require_torch_gpu
def test_small_model_integration_test_batch_flashatt2(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct",
dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
device_map="auto",
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="pt").to(
torch_device
)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30)
EXPECTED_DECODED_TEXT = [
"system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
"system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
]
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
@require_flash_attn
@require_torch_gpu
def test_small_model_integration_test_batch_wo_image_flashatt2(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct",
dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
device_map="auto",
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
messages2 = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who are you?"},
]
text2 = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text, text2], images=[self.image], padding=True, return_tensors="pt").to(
torch_device
)
# it should not matter that only one of the two prompts has an image
output = model.generate(**inputs, max_new_tokens=30)
EXPECTED_DECODED_TEXT = [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
'system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am a large language model created by Alibaba Cloud. I am called Qwen.'
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)


@@ -0,0 +1,395 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import shutil
import tempfile
import unittest
import numpy as np
import pytest
from transformers import AutoProcessor, Qwen2TokenizerFast
from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
if is_vision_available():
from transformers import Qwen2VLProcessor
if is_torchvision_available():
from transformers import Qwen2VLImageProcessorFast, Qwen2VLVideoProcessor
if is_torch_available():
import torch
@require_vision
@require_torch
@require_torchvision
class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = Qwen2VLProcessor
@classmethod
def setUpClass(cls):
cls.tmpdirname = tempfile.mkdtemp()
processor = Qwen2VLProcessor.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct", patch_size=4, max_pixels=56 * 56, min_pixels=28 * 28
)
processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def get_video_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
def get_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
def tearDownClass(cls):
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
# Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens
def test_get_num_vision_tokens(self):
"Tests general functionality of the helper used internally in vLLM"
processor = self.get_processor()
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
self.assertTrue("num_image_tokens" in output)
self.assertEqual(len(output["num_image_tokens"]), 3)
self.assertTrue("num_image_patches" in output)
self.assertEqual(len(output["num_image_patches"]), 3)
def test_save_load_pretrained_default(self):
tokenizer = self.get_tokenizer()
image_processor = self.get_image_processor()
video_processor = self.get_video_processor()
processor = Qwen2VLProcessor(
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
)
processor.save_pretrained(self.tmpdirname)
processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=True)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)
self.assertIsInstance(processor.video_processor, Qwen2VLVideoProcessor)
def test_image_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
video_processor = self.get_video_processor()
processor = Qwen2VLProcessor(
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
)
image_input = self.prepare_image_inputs()
input_image_proc = image_processor(image_input, return_tensors="pt")
input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
for key in input_image_proc:
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
video_processor = self.get_video_processor()
processor = Qwen2VLProcessor(
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input)
self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values", "image_grid_thw"])
# test if it raises when no input is passed
with pytest.raises(ValueError):
processor()
# test if it raises when no text is passed
with pytest.raises(TypeError):
processor(images=image_input)
@require_torch
@require_av
def _test_apply_chat_template(
self,
modality: str,
batch_size: int,
return_tensors: str,
input_name: str,
processor_name: str,
input_data: list[str],
):
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
if processor_name not in self.processor_class.attributes:
self.skipTest(f"{processor_name} attribute not present in {self.processor_class}")
batch_messages = [
[
{
"role": "user",
"content": [{"type": "text", "text": "Describe this."}],
},
]
] * batch_size
# Test that jinja can be applied
formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
self.assertEqual(len(formatted_prompt), batch_size)
# Test that tokenizing with template and directly with `self.tokenizer` gives same output
formatted_prompt_tokenized = processor.apply_chat_template(
batch_messages, add_generation_prompt=True, tokenize=True, return_tensors=return_tensors
)
add_special_tokens = True
if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
add_special_tokens = False
tok_output = processor.tokenizer(
formatted_prompt, return_tensors=return_tensors, add_special_tokens=add_special_tokens
)
expected_output = tok_output.input_ids
self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())
# Test that kwargs passed to processor's `__call__` are actually used
tokenized_prompt_100 = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
padding="max_length",
truncation=True,
return_tensors=return_tensors,
max_length=100,
)
self.assertEqual(len(tokenized_prompt_100[0]), 100)
# Test that `return_dict=True` returns text related inputs in the dict
out_dict_text = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors=return_tensors,
)
self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"]))
self.assertEqual(len(out_dict_text["input_ids"]), batch_size)
self.assertEqual(len(out_dict_text["attention_mask"]), batch_size)
# Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
for idx, url in enumerate(input_data[:batch_size]):
batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": modality, "url": url}]
out_dict = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors=return_tensors,
num_frames=2, # by default no more than 2 frames, otherwise too slow
)
input_name = getattr(self, input_name)
self.assertTrue(input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
if modality == "video":
# Qwen pixel values don't scale with batch size the same way as other models; compute the expected video token count from video_grid_thw instead
expected_video_token_count = 0
for thw in out_dict["video_grid_thw"]:
expected_video_token_count += thw[0] * thw[1] * thw[2]
mm_len = expected_video_token_count
else:
mm_len = batch_size * 192
self.assertEqual(len(out_dict[input_name]), mm_len)
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
for k in out_dict:
self.assertIsInstance(out_dict[k], return_tensor_to_type[return_tensors])
@require_av
def test_apply_chat_template_video_frame_sampling(self):
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
signature = inspect.signature(processor.__call__)
if "videos" not in {*signature.parameters.keys()} or (
signature.parameters.get("videos") is not None
and signature.parameters["videos"].annotation == inspect._empty
):
self.skipTest("Processor doesn't accept videos at input")
messages = [
[
{
"role": "user",
"content": [
{"type": "video"},
{"type": "text", "text": "What is shown in this video?"},
],
},
]
]
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
self.assertEqual(len(formatted_prompt), 1)
formatted_prompt_tokenized = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
expected_output = processor.tokenizer(formatted_prompt, return_tensors=None).input_ids
self.assertListEqual(expected_output, formatted_prompt_tokenized)
out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True)
self.assertListEqual(list(out_dict.keys()), ["input_ids", "attention_mask"])
# Add video URL for return dict and load with `num_frames` arg
messages[0][0]["content"][0] = {
"type": "video",
"url": url_to_local_path(
"https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
),
}
num_frames = 3
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
num_frames=num_frames,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
# Load with `fps` arg
fps = 1
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
fps=fps,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
# Load with `fps` and `num_frames` args, should raise an error
with self.assertRaises(ValueError):
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
fps=fps,
num_frames=num_frames,
)
# Load without any arg should load the whole video
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1080)
# Load video as a list of frames (i.e. images). NOTE: each frame should have the same size
# because we assume they come from one video
messages[0][0]["content"][0] = {
"type": "video",
"url": [
url_to_local_path(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
),
url_to_local_path(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
),
],
}
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)
# When the inputs are frame URLs/paths, we expect that they are already
# sampled, and an error is raised if asked to sample again.
with self.assertRaisesRegex(
ValueError, "Sampling frames from a list of images is not supported! Set `do_sample_frames=False`"
):
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
do_sample_frames=True,
)
def test_kwargs_overrides_custom_image_processor_kwargs(self):
processor = self.get_processor()
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(inputs[self.images_input_name].shape[0], 100)
inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
self.assertEqual(inputs[self.images_input_name].shape[0], 612)
def test_special_mm_token_truncation(self):
"""Tests that special vision tokens do not get truncated when `truncation=True` is set."""
processor = self.get_processor()
input_str = self.prepare_text_inputs(batch_size=2, modalities="image")
image_input = self.prepare_image_inputs(batch_size=2)
_ = processor(
text=input_str,
images=image_input,
return_tensors="pt",
truncation=None,
padding=True,
)
with self.assertRaises(ValueError):
_ = processor(
text=input_str,
images=image_input,
return_tensors="pt",
truncation=True,
padding=True,
max_length=20,
)


@@ -0,0 +1,345 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers.image_utils import get_image_size
from transformers.models.qwen2_vl.video_processing_qwen2_vl import smart_resize
if is_torchvision_available():
from transformers import Qwen2VLVideoProcessor
class Qwen2VLVideoProcessingTester:
def __init__(
self,
parent,
batch_size=5,
num_frames=8,
num_channels=3,
min_resolution=30,
max_resolution=80,
do_resize=True,
size=None,
do_normalize=True,
image_mean=OPENAI_CLIP_MEAN,
image_std=OPENAI_CLIP_STD,
do_convert_rgb=True,
temporal_patch_size=2,
patch_size=14,
min_pixels=20 * 20,
max_pixels=100 * 100,
merge_size=2,
):
size = size if size is not None else {"shortest_edge": 20}
self.parent = parent
self.batch_size = batch_size
self.num_frames = num_frames
self.num_channels = num_channels
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
self.temporal_patch_size = temporal_patch_size
self.patch_size = patch_size
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.merge_size = merge_size
def prepare_video_processor_dict(self):
return {
"do_resize": self.do_resize,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
"temporal_patch_size": self.temporal_patch_size,
"patch_size": self.patch_size,
"min_pixels": self.min_pixels,
"max_pixels": self.max_pixels,
"merge_size": self.merge_size,
}
@require_vision
def expected_output_video_shape(self, videos, num_frames=None):
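# Each group of temporal_patch_size frames forms one temporal step (grid_t); every frame is split into a
# (resized_h / patch_size) x (resized_w / patch_size) grid of patches, flattened into rows of hidden_dim values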
num_frames = num_frames if num_frames is not None else self.num_frames
grid_t = num_frames // self.temporal_patch_size
hidden_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
seq_len = 0
for video in videos:
if isinstance(video[0], Image.Image):
video = np.stack([np.array(frame) for frame in video])
height, width = get_image_size(video)
resized_height, resized_width = smart_resize(
height,
width,
factor=self.patch_size * self.merge_size,
min_pixels=self.min_pixels,
max_pixels=self.max_pixels,
)
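            # The resized frame is split into patch_size x patch_size patches; each video
            # contributes grid_t * grid_h * grid_w patches to the flattened sequence.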
grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
seq_len += grid_t * grid_h * grid_w
return [seq_len, hidden_dim]

    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
videos = prepare_video_inputs(
batch_size=self.batch_size,
num_frames=self.num_frames,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
return_tensors=return_tensors,
)
return videos


@require_torch
@require_vision
class Qwen2VLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
fast_video_processing_class = Qwen2VLVideoProcessor if is_torchvision_available() else None

    def setUp(self):
super().setUp()
self.video_processor_tester = Qwen2VLVideoProcessingTester(self)

    @property
def video_processor_dict(self):
return self.video_processor_tester.prepare_video_processor_dict()

    def test_video_processor_properties(self):
video_processing = self.fast_video_processing_class(**self.video_processor_dict)
self.assertTrue(hasattr(video_processing, "do_resize"))
self.assertTrue(hasattr(video_processing, "size"))
self.assertTrue(hasattr(video_processing, "do_normalize"))
self.assertTrue(hasattr(video_processing, "image_mean"))
self.assertTrue(hasattr(video_processing, "image_std"))
self.assertTrue(hasattr(video_processing, "do_convert_rgb"))

    # OVERRIDDEN BECAUSE QWEN2_VL HAS SPECIAL OUTPUT SHAPES
def test_video_processor_from_dict_with_kwargs(self):
for video_processing_class in self.video_processor_list:
video_processor = video_processing_class(**self.video_processor_dict)
self.assertEqual(video_processor.min_pixels, self.video_processor_tester.min_pixels)
self.assertEqual(video_processor.max_pixels, self.video_processor_tester.max_pixels)
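            # Keyword arguments passed to `from_dict` should override the values in the dict.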
video_processor = video_processing_class.from_dict(
self.video_processor_dict, min_pixels=256 * 256, max_pixels=640 * 640
)
self.assertEqual(video_processor.min_pixels, 256 * 256)
self.assertEqual(video_processor.max_pixels, 640 * 640)

    def test_call_pil(self):
for video_processing_class in self.video_processor_list:
# Initialize video_processing
video_processing = video_processing_class(**self.video_processor_dict)
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False, return_tensors="pil"
)
# Each video is a list of PIL Images
for video in video_inputs:
self.assertIsInstance(video[0], Image.Image)
# Test not batched input
encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)

    def test_call_numpy(self):
for video_processing_class in self.video_processor_list:
# Initialize video_processing
video_processing = video_processing_class(**self.video_processor_dict)
# create random numpy tensors
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False, return_tensors="np"
)
for video in video_inputs:
self.assertIsInstance(video, np.ndarray)
# Test not batched input
encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)

    def test_call_pytorch(self):
for video_processing_class in self.video_processor_list:
# Initialize video_processing
video_processing = video_processing_class(**self.video_processor_dict)
# create random PyTorch tensors
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False, return_tensors="torch"
)
for video in video_inputs:
self.assertIsInstance(video, torch.Tensor)
# Test not batched input
encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
# Test batched
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
self.assertEqual(
list(encoded_videos.shape),
expected_output_video_shape,
)

    def test_nested_input(self):
"""Tests that the processor can work with nested list where each video is a list of arrays"""
for video_processing_class in self.video_processor_list:
video_processing = video_processing_class(**self.video_processor_dict)
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False, return_tensors="np"
)
# Test not batched input
video_inputs_nested = [list(video) for video in video_inputs]
encoded_videos = video_processing(video_inputs_nested[0], return_tensors="pt")[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
# Test batched
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
encoded_videos = video_processing(video_inputs_nested, return_tensors="pt")[self.input_name]
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)

    @unittest.skip("Skip for now, the test needs adjustment for Qwen2VL")
def test_call_numpy_4_channels(self):
for video_processing_class in self.video_processor_list:
            # Test that videos with an arbitrary number of channels can be processed
# Initialize video_processing
video_processor = video_processing_class(**self.video_processor_dict)
# create random numpy tensors
self.video_processor_tester.num_channels = 4
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False, return_tensors="np"
)
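            # Scalar `image_mean`/`image_std` are passed because the CLIP defaults only define
            # three per-channel values, which would not match 4-channel inputs.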
# Test not batched input
encoded_videos = video_processor(
video_inputs[0],
return_tensors="pt",
input_data_format="channels_last",
image_mean=0,
image_std=1,
)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = video_processor(
video_inputs,
return_tensors="pt",
input_data_format="channels_last",
image_mean=0,
image_std=1,
)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)

    def test_call_sample_frames(self):
for video_processing_class in self.video_processor_list:
video_processing = video_processing_class(**self.video_processor_dict)
prev_num_frames = self.video_processor_tester.num_frames
self.video_processor_tester.num_frames = 8
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False,
return_tensors="torch",
)
            # Force `do_sample_frames` to False. No sampling is expected even when `num_frames` is passed
video_processing.do_sample_frames = False
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
            # Enable sampling. Each video should be sampled down to `num_frames` frames in the output
video_processing.do_sample_frames = True
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=4)[self.input_name]
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=4)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(
[video_inputs[0]], num_frames=4
)
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(
video_inputs, num_frames=4
)
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
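            # Sample by `fps` using the video metadata: a 2.0 s clip sampled at 3 fps
            # should yield 2.0 * 3 = 6 frames per video.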
metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
batched_metadata = metadata * len(video_inputs)
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata)[
self.input_name
]
encoded_videos_batched = video_processing(
video_inputs, return_tensors="pt", fps=3, video_metadata=batched_metadata
)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(
[video_inputs[0]], num_frames=6
)
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(
video_inputs, num_frames=6
)
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
            # An error should be raised when asked to sample more frames than the input video contains
with self.assertRaises(ValueError):
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=10)[self.input_name]
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=10)[
self.input_name
]
            # Restore the original number of frames in the tester
self.video_processor_tester.num_frames = prev_num_frames