init
@@ -0,0 +1,274 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

from transformers.image_utils import load_image
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available

from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
from ...test_processing_common import url_to_local_path


if is_torch_available():
    import torch

if is_vision_available():
    from PIL import Image

    from transformers import DeepseekVLHybridImageProcessor

if is_torchvision_available():
    from transformers import DeepseekVLHybridImageProcessorFast


class DeepseekVLHybridImageProcessingTester:
    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        image_size=18,
        min_resolution=30,
        max_resolution=400,
        do_resize=True,
        size=None,
        high_res_size=None,
        do_normalize=True,
        image_mean=[0.5, 0.5, 0.5],
        image_std=[0.5, 0.5, 0.5],
        high_res_image_mean=[0.5, 0.5, 0.5],
        high_res_image_std=[0.5, 0.5, 0.5],
    ):
        size = size if size is not None else {"height": 18, "width": 18}
        high_res_size = high_res_size if high_res_size is not None else {"height": 36, "width": 36}
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
        self.high_res_size = high_res_size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.high_res_image_mean = high_res_image_mean
        self.high_res_image_std = high_res_image_std

    def prepare_image_processor_dict(self):
        return {
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "high_res_image_mean": self.high_res_image_mean,
            "high_res_image_std": self.high_res_image_std,
            "do_normalize": self.do_normalize,
            "do_resize": self.do_resize,
            "size": self.size,
            "high_res_size": self.high_res_size,
        }

    def expected_output_image_shape(self, images):
        max_size = max(self.size["height"], self.size["width"])
        return self.num_channels, max_size, max_size

    def expected_output_high_res_image_shape(self, images):
        max_size = max(self.high_res_size["height"], self.high_res_size["width"])
        return self.num_channels, max_size, max_size

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        return prepare_image_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )
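
# A minimal usage sketch (illustrative only; `tester` stands in for an instance of the tester
# above, and torch/vision are assumed to be installed). The hybrid image processor is expected
# to return both low-res `pixel_values` (resized to `size`) and `high_res_pixel_values`
# (resized to `high_res_size`), which is what the tests below assert:
#
#     processor = DeepseekVLHybridImageProcessor(**tester.prepare_image_processor_dict())
#     outputs = processor(tester.prepare_image_inputs(), return_tensors="pt")
#     outputs.pixel_values.shape           # (batch_size, 3, 18, 18)
#     outputs.high_res_pixel_values.shape  # (batch_size, 3, 36, 36)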


@require_torch
@require_vision
class DeepseekVLHybridImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = DeepseekVLHybridImageProcessor if is_vision_available() else None
    fast_image_processing_class = DeepseekVLHybridImageProcessorFast if is_torchvision_available() else None

    # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTest.setUp with ViT->DeepseekVLHybrid
    def setUp(self):
        super().setUp()
        self.image_processor_tester = DeepseekVLHybridImageProcessingTester(self)

    @property
    # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTest.image_processor_dict with ViT->DeepseekVLHybrid
    def image_processor_dict(self):
        return self.image_processor_tester.prepare_image_processor_dict()

    # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTest.test_image_processor_from_dict_with_kwargs
    def test_image_processor_from_dict_with_kwargs(self):
        for image_processing_class in self.image_processor_list:
            image_processor = image_processing_class.from_dict(self.image_processor_dict)
            self.assertEqual(image_processor.size, {"height": 18, "width": 18})

            image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
            self.assertEqual(image_processor.size, {"height": 42, "width": 42})

    def test_image_processor_properties(self):
        for image_processing_class in self.image_processor_list:
            image_processing = image_processing_class(**self.image_processor_dict)
            self.assertTrue(hasattr(image_processing, "image_mean"))
            self.assertTrue(hasattr(image_processing, "image_std"))
            self.assertTrue(hasattr(image_processing, "high_res_image_mean"))
            self.assertTrue(hasattr(image_processing, "high_res_image_std"))
            self.assertTrue(hasattr(image_processing, "do_normalize"))
            self.assertTrue(hasattr(image_processing, "do_resize"))
            self.assertTrue(hasattr(image_processing, "size"))
            self.assertTrue(hasattr(image_processing, "high_res_size"))

    def test_call_pil_high_res(self):
        for image_processing_class in self.image_processor_list:
            # Initialize image_processing
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random PIL images
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
            for image in image_inputs:
                self.assertIsInstance(image, Image.Image)

            # Test not batched input
            encoded_images = image_processing(image_inputs[0], return_tensors="pt").high_res_pixel_values
            expected_output_image_shape = self.image_processor_tester.expected_output_high_res_image_shape(
                [image_inputs[0]]
            )
            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

            # Test batched
            encoded_images = image_processing(image_inputs, return_tensors="pt").high_res_pixel_values
            expected_output_image_shape = self.image_processor_tester.expected_output_high_res_image_shape(
                image_inputs
            )
            self.assertEqual(
                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
            )

    def test_call_numpy_high_res(self):
        for image_processing_class in self.image_processor_list:
            # Initialize image_processing
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random numpy tensors
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
            for image in image_inputs:
                self.assertIsInstance(image, np.ndarray)

            # Test not batched input
            encoded_images = image_processing(image_inputs[0], return_tensors="pt").high_res_pixel_values
            expected_output_image_shape = self.image_processor_tester.expected_output_high_res_image_shape(
                [image_inputs[0]]
            )
            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

            # Test batched
            encoded_images = image_processing(image_inputs, return_tensors="pt").high_res_pixel_values
            expected_output_image_shape = self.image_processor_tester.expected_output_high_res_image_shape(
                image_inputs
            )
            self.assertEqual(
                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
            )

    def test_call_pytorch_high_res(self):
        for image_processing_class in self.image_processor_list:
            # Initialize image_processing
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random PyTorch tensors
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)

            for image in image_inputs:
                self.assertIsInstance(image, torch.Tensor)

            # Test not batched input
            encoded_images = image_processing(image_inputs[0], return_tensors="pt").high_res_pixel_values
            expected_output_image_shape = self.image_processor_tester.expected_output_high_res_image_shape(
                [image_inputs[0]]
            )
            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

            # Test batched
            expected_output_image_shape = self.image_processor_tester.expected_output_high_res_image_shape(
                image_inputs
            )
            encoded_images = image_processing(image_inputs, return_tensors="pt").high_res_pixel_values
            self.assertEqual(
                tuple(encoded_images.shape),
                (self.image_processor_tester.batch_size, *expected_output_image_shape),
            )

    @require_vision
    @require_torch
    def test_slow_fast_equivalence(self):
        if not self.test_slow_image_processor or not self.test_fast_image_processor:
            self.skipTest(reason="Skipping slow/fast equivalence test")

        if self.image_processing_class is None or self.fast_image_processing_class is None:
            self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

        dummy_image = load_image(url_to_local_path("http://images.cocodataset.org/val2017/000000039769.jpg"))
        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)

        encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
        encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
        self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
        self._assert_slow_fast_tensors_equivalence(
            encoding_slow.high_res_pixel_values, encoding_fast.high_res_pixel_values
        )

    @require_vision
    @require_torch
    def test_slow_fast_equivalence_batched(self):
        if not self.test_slow_image_processor or not self.test_fast_image_processor:
            self.skipTest(reason="Skipping slow/fast equivalence test")

        if self.image_processing_class is None or self.fast_image_processing_class is None:
            self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

        if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
            self.skipTest(
                reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
            )

        dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)

        encoding_slow = image_processor_slow(dummy_images, return_tensors=None)
        encoding_fast = image_processor_fast(dummy_images, return_tensors=None)

        # Overwrite as the outputs are not always all of the same shape (kept for BC)
        for i in range(len(encoding_slow.pixel_values)):
            self._assert_slow_fast_tensors_equivalence(
                torch.from_numpy(encoding_slow.pixel_values[i]), encoding_fast.pixel_values[i]
            )
        for i in range(len(encoding_slow.high_res_pixel_values)):
            self._assert_slow_fast_tensors_equivalence(
                torch.from_numpy(encoding_slow.high_res_pixel_values[i]), encoding_fast.high_res_pixel_values[i]
            )

    @unittest.skip(reason="Not supported")
    def test_call_numpy_4_channels(self):
        pass

@@ -0,0 +1,404 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch DeepseekVLHybrid model."""

import re
import tempfile
import unittest

from transformers import (
    AutoProcessor,
    DeepseekVLHybridConfig,
    DeepseekVLHybridForConditionalGeneration,
    DeepseekVLHybridModel,
    is_torch_available,
)
from transformers.testing_utils import (
    require_torch,
    require_torch_accelerator,
    slow,
    torch_device,
)

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask


if is_torch_available():
    import torch


class DeepseekVLHybridModelTester:
    def __init__(
        self,
        parent,
        batch_size=2,
        seq_length=25,
        num_channels=3,
        initializer_range=0.02,
        is_training=True,
        use_cache=False,
        text_config={
            "num_hidden_layers": 2,
            "vocab_size": 99,
            "hidden_size": 16,
            "intermediate_size": 37,
            "max_position_embeddings": 512,
            "num_attention_heads": 4,
            "pad_token_id": 1,
        },
        vision_config={
            "num_hidden_layers": 1,
            "hidden_size": 16,
            "intermediate_size": 37,
            "image_size": 32,
            "patch_size": 8,
            "hidden_act": "gelu",
            "vision_use_head": False,
            "num_attention_heads": 4,
        },
        high_res_vision_config={
            "num_hidden_layers": 2,
            "global_attn_indexes": [0],
            "hidden_size": 16,
            "intermediate_size": 37,
            "mlp_dim": 24,
            "output_channels": 4,
            "image_size": 128,
            "patch_size": 32,
            "num_attention_heads": 4,
        },
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.num_channels = num_channels
        self.initializer_range = initializer_range
        self.is_training = is_training
        self.use_cache = use_cache

        self.text_config = text_config
        self.vision_config = vision_config
        self.high_res_vision_config = high_res_vision_config
        self.vision_config["num_channels"] = self.num_channels
        self.high_res_vision_config["num_channels"] = self.num_channels

        self.num_hidden_layers = text_config["num_hidden_layers"]
        self.vocab_size = text_config["vocab_size"]
        self.hidden_size = text_config["hidden_size"]
        self.num_attention_heads = text_config["num_attention_heads"]
        self.high_res_image_size = high_res_vision_config["image_size"]
        self.image_size = vision_config["image_size"]
        self.num_image_tokens = vision_config["image_size"] // vision_config["patch_size"]
        self.pad_token_id = text_config["pad_token_id"]
        self.image_token_id = self.vocab_size - 1

    def get_config(self):
        return DeepseekVLHybridConfig(
            text_config=self.text_config,
            vision_config=self.vision_config,
            high_res_vision_config=self.high_res_vision_config,
            image_token_id=self.image_token_id,
        )

    def prepare_config_and_inputs(self):
        config = self.get_config()

        # create text and vision inputs
        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1
        attention_mask = random_attention_mask([self.batch_size, self.seq_length])
        pixel_values = floats_tensor(
            [
                self.batch_size,
                self.num_channels,
                self.image_size,
                self.image_size,
            ]
        )
        high_res_pixel_values = floats_tensor(
            [
                self.batch_size,
                self.num_channels,
                self.high_res_image_size,
                self.high_res_image_size,
            ]
        )
        # fill image_tokens
        input_ids[:, : self.num_image_tokens] = self.image_token_id

        return config, input_ids, attention_mask, pixel_values, high_res_pixel_values

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        config, input_ids, attention_mask, pixel_values, high_res_pixel_values = config_and_inputs
        inputs_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "pixel_values": pixel_values,
            "high_res_pixel_values": high_res_pixel_values,
        }
        return config, inputs_dict
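
# For orientation (a sketch; these shapes follow from the default configs above, not from a real
# checkpoint): `pixel_values` is (batch_size, 3, 32, 32), `high_res_pixel_values` is
# (batch_size, 3, 128, 128), and the first `num_image_tokens` (= 32 // 8 = 4) positions of
# `input_ids` are overwritten with `image_token_id`, so the model has image positions to fill
# with vision features during the tests below.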


@require_torch
class DeepseekVLHybridModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    all_model_classes = (
        (DeepseekVLHybridModel, DeepseekVLHybridForConditionalGeneration) if is_torch_available() else ()
    )
    pipeline_model_mapping = (
        {
            "feature-extraction": DeepseekVLHybridModel,
            "image-text-to-text": DeepseekVLHybridForConditionalGeneration,
        }
        if is_torch_available()
        else {}
    )
    _is_composite = True
    test_pruning = False
    test_head_masking = False

    def setUp(self):
        self.model_tester = DeepseekVLHybridModelTester(self)
        self.config_tester = ConfigTester(self, config_class=DeepseekVLHybridConfig, has_text_modality=False)

    # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
    def test_inputs_embeds(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)

            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]
            del inputs["high_res_pixel_values"]

            wte = model.get_input_embeddings()
            inputs["inputs_embeds"] = wte(input_ids)

            with torch.no_grad():
                model(**inputs)

    # overwrite inputs_embeds tests because we need to delete "pixel values" for VLMs.
    def test_inputs_embeds_matches_input_ids(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)
            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]
            del inputs["high_res_pixel_values"]

            inputs_embeds = model.get_input_embeddings()(input_ids)

            with torch.no_grad():
                out_ids = model(input_ids=input_ids, **inputs)[0]
                out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
            torch.testing.assert_close(out_embeds, out_ids)

    @unittest.skip(reason="Siglip uses a non-standard initialization scheme")
    # Copied from tests.models.siglip.test_modeling_siglip.SiglipVisionModelTest.test_initialization
    def test_initialization(self):
        pass

    def test_sdpa_can_dispatch_composite_models(self):
        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            model = model_class(config)

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)

                # Load the model with SDPA
                model_sdpa = model_class.from_pretrained(
                    tmpdirname,
                    attn_implementation="sdpa",
                )
                model_sdpa = model_sdpa.eval().to(torch_device)

                # Load model with eager attention
                model_eager = model_class.from_pretrained(
                    tmpdirname,
                    attn_implementation="eager",
                )
                model_eager = model_eager.eval().to(torch_device)

            self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
            self.assertTrue(model_eager.config._attn_implementation == "eager")

            if (
                hasattr(model_sdpa, "vision_model")
                and hasattr(model_sdpa, "high_res_vision_model")
                and hasattr(model_sdpa, "language_model")
            ):
                self.assertTrue(model_sdpa.language_model.config._attn_implementation == "sdpa")
                self.assertTrue(model_sdpa.vision_model.config._attn_implementation == "sdpa")
                self.assertTrue(model_sdpa.high_res_vision_model.config._attn_implementation == "sdpa")
                self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
                self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
                self.assertTrue(model_eager.high_res_vision_model.config._attn_implementation == "eager")

            # The eager-loaded model should not contain any Attention submodule still set to SDPA
            # (the assert below fails if one is found).
            for name, submodule in model_eager.named_modules():
                class_name = submodule.__class__.__name__
                if (
                    any(re.finditer(r"Attention(?!Pool)", class_name))
                    and getattr(submodule, "config", None)
                    and submodule.config._attn_implementation == "sdpa"
                ):
                    self.assertTrue(submodule.config._attn_implementation == "eager")

            # Likewise, the SDPA-loaded model should not contain any Attention submodule that fell back to eager.
            for name, submodule in model_sdpa.named_modules():
                class_name = submodule.__class__.__name__
                if (
                    any(re.finditer(r"Attention(?!Pool)", class_name))
                    and getattr(submodule, "config", None)
                    and submodule.config._attn_implementation == "eager"
                ):
                    self.assertTrue(submodule.config._attn_implementation == "sdpa")


@require_torch
@require_torch_accelerator
@slow
class DeepseekVLHybridIntegrationTest(unittest.TestCase):
    def setUp(self):
        self.model_id = "deepseek-community/deepseek-vl-7b-chat"

    def test_model_text_generation(self):
        model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
            self.model_id, dtype="auto", device_map="auto"
        )
        model.to(torch_device)
        model.eval()
        processor = AutoProcessor.from_pretrained(self.model_id)

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
                    },
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ]
        EXPECTED_TEXT = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:The image depicts a fluffy, beige-colored animal with a long tail, walking on snow. The'  # fmt: skip

        inputs = processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
        )
        inputs = inputs.to(model.device, dtype=model.dtype)
        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        text = processor.decode(output[0], skip_special_tokens=True)

        self.assertEqual(
            text,
            EXPECTED_TEXT,
        )

    def test_model_text_generation_batched(self):
        model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
            self.model_id, dtype="auto", device_map="auto"
        )
        model.to(torch_device)
        model.eval()
        processor = AutoProcessor.from_pretrained(self.model_id)

        messages = [
            [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
                        },
                        {"type": "text", "text": "Describe this image."},
                    ],
                }
            ],
            [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
                        },
                        {"type": "text", "text": "What animal do you see in the image?"},
                    ],
                }
            ],
        ]
        EXPECTED_TEXT = [
            "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:The image depicts a fluffy, beige-colored animal with a long tail, walking on snow. The",  # fmt: skip
            "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What animal do you see in the image?\n\nAssistant:I see a large, furry animal that appears to be a type of bear.The ",  # fmt: skip
        ]

        inputs = processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, padding=True, return_dict=True, return_tensors="pt"
        )
        inputs = inputs.to(model.device, dtype=model.dtype)
        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        text = processor.batch_decode(output, skip_special_tokens=True)

        self.assertEqual(EXPECTED_TEXT, text)

    def test_model_text_generation_with_multi_image(self):
        model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
            self.model_id, dtype="auto", device_map="auto"
        )
        model.to(torch_device)
        model.eval()
        processor = AutoProcessor.from_pretrained(self.model_id)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's the difference between"},
                    {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
                    {"type": "text", "text": " and "},
                    {
                        "type": "image",
                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg",
                    },
                ],
            }
        ]
        EXPECTED_TEXT = "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What's the difference between and \n\nAssistant:The image shows a street scene with a prominent red stop sign in the foreground. The sign has the"  # fmt: skip

        inputs = processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
        )
        inputs = inputs.to(model.device, dtype=model.dtype)
        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        text = processor.decode(output[0], skip_special_tokens=True)

        self.assertEqual(
            text,
            EXPECTED_TEXT,
        )

@@ -0,0 +1,58 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest

from transformers import DeepseekVLHybridProcessor, LlamaTokenizer
from transformers.testing_utils import get_tests_dir
from transformers.utils import is_vision_available

from ...test_processing_common import ProcessorTesterMixin


if is_vision_available():
    from transformers import DeepseekVLHybridImageProcessor


SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")


class DeepseekVLHybridProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = DeepseekVLHybridProcessor

    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()
        image_processor = DeepseekVLHybridImageProcessor()
        tokenizer = LlamaTokenizer(
            vocab_file=SAMPLE_VOCAB,
            extra_special_tokens={
"pad_token": "<|end▁of▁sentence|>",
|
||||
"image_token": "<image_placeholder>",
|
||||
},
|
||||
)
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
processor = self.processor_class(
|
||||
image_processor=image_processor,
|
||||
tokenizer=tokenizer,
|
||||
**processor_kwargs,
|
||||
)
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {
|
||||
"chat_template": "{% set seps = ['\n\n', '<\uff5cend\u2581of\u2581sentence\uff5c>'] %}{% set i = 0 %}You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\n{% for message in messages %}{% if message['role']|lower == 'user' %}User: {% elif message['role']|lower == 'assistant' %}Assistant:{% if not (loop.last and not add_generation_prompt and message['content'][0]['type']=='text' and message['content'][0]['text']=='') %} {% endif %}{% else %}{{ message['role'].capitalize() }}: {% endif %}{% for content in message['content'] %}{% if content['type'] == 'image' %}<image_placeholder>{% elif content['type'] == 'text' %}{% set text = content['text'] %}{% if loop.first %}{% set text = text.lstrip() %}{% endif %}{% if loop.last %}{% set text = text.rstrip() %}{% endif %}{% if not loop.first and message['content'][loop.index0-1]['type'] == 'text' %}{{ ' ' + text }}{% else %}{{ text }}{% endif %}{% endif %}{% endfor %}{% if not loop.last or add_generation_prompt %}{% if message['role']|lower == 'user' %}{{ seps[0] }}{% else %}{{ seps[1] }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}Assistant:{% endif %}",
|
||||
"num_image_tokens": 576,
|
||||
} # fmt: skip
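
    # For reference, with a single user turn containing one image and add_generation_prompt=True,
    # the template above is expected to render roughly as the sketch below (compare the
    # EXPECTED_TEXT strings in the modeling integration tests, which show the same prompt after
    # decoding with special tokens skipped):
    #
    #   You are a helpful language and vision assistant. You are able to understand the visual
    #   content that the user provides, and assist the user with a variety of tasks using
    #   natural language.
    #
    #   User: <image_placeholder>Describe this image.
    #
    #   Assistant: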