2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions


@@ -0,0 +1,242 @@
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import JanusImageProcessor
if is_torchvision_available():
from transformers import JanusImageProcessorFast
class JanusImageProcessingTester:
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
image_size=384,
min_resolution=30,
max_resolution=200,
do_resize=True,
size=None,
do_normalize=True,
image_mean=[0.48145466, 0.4578275, 0.40821073],
image_std=[0.26862954, 0.26130258, 0.27577711],
do_convert_rgb=True,
):
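# Default to 384 x 384, the resolution the Janus image processor is tested against below.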
size = size if size is not None else {"height": 384, "width": 384}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
}
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
@require_torch
@require_vision
class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = JanusImageProcessor if is_vision_available() else None
fast_image_processing_class = JanusImageProcessorFast if is_torchvision_available() else None
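# `ImageProcessingTestMixin` collects these two classes into `self.image_processor_list`, so each test below exercises both the slow and the fast processor when available.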
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->Janus
def setUp(self):
super().setUp()
self.image_processor_tester = JanusImageProcessingTester(self)
@property
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
def test_image_processor_from_dict_with_kwargs(self):
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"height": 384, "width": 384})
self.assertEqual(image_processor.image_mean, [0.48145466, 0.4578275, 0.40821073])
image_processor = image_processing_class.from_dict(
self.image_processor_dict, size=42, image_mean=[1.0, 2.0, 1.0]
)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
self.assertEqual(image_processor.image_mean, [1.0, 2.0, 1.0])
def test_call_pil(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
for image in image_inputs:
self.assertIsInstance(image, Image.Image)
# Test non-batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = (1, 3, 384, 384)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = (7, 3, 384, 384)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
def test_call_numpy(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
for image in image_inputs:
self.assertIsInstance(image, np.ndarray)
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = (1, 3, 384, 384)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = (7, 3, 384, 384)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
def test_call_pytorch(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = (1, 3, 384, 384)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = (7, 3, 384, 384)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
def test_nested_input(self):
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
# Test batched as a list of images.
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = (7, 3, 384, 384)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
# Test batched as a nested list of images, where each sublist is one batch.
image_inputs_nested = [image_inputs[:3], image_inputs[3:]]
encoded_images_nested = image_processing(image_inputs_nested, return_tensors="pt").pixel_values
expected_output_image_shape = (7, 3, 384, 384)
self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape)
# The image processor should return the same pixel values regardless of the input format.
self.assertTrue((encoded_images_nested == encoded_images).all())
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
self.skipTest(
reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
)
dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
encoding_slow = image_processor_slow(dummy_images, return_tensors=None)
encoding_fast = image_processor_fast(dummy_images, return_tensors=None)
# Overwrite as the outputs are not always all of the same shape (kept for BC)
for i in range(len(encoding_slow.pixel_values)):
self._assert_slow_fast_tensors_equivalence(
torch.from_numpy(encoding_slow.pixel_values[i]), encoding_fast.pixel_values[i]
)
@require_vision
@require_torch
def test_slow_fast_equivalence_postprocess(self):
dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
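# Scale the raw 0-255 tensors down to [0, 1] before handing them to `postprocess`, which reverses normalization and rescaling.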
dummy_images = [image / 255.0 for image in dummy_images]
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
encoding_slow = image_processor_slow.postprocess(dummy_images, return_tensors=None)
encoding_fast = image_processor_fast.postprocess(dummy_images, return_tensors=None)
# Overwrite as the outputs are not always all of the same shape (kept for BC)
for i in range(len(encoding_slow.pixel_values)):
self._assert_slow_fast_tensors_equivalence(
torch.from_numpy(encoding_slow.pixel_values[i]).float(), encoding_fast.pixel_values[i].float()
)
@unittest.skip(reason="Not supported")
def test_call_numpy_4_channels(self):
pass


@@ -0,0 +1,534 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Janus model."""
import re
import tempfile
import unittest
from functools import reduce
import numpy as np
import pytest
import requests
from transformers import (
AutoProcessor,
JanusConfig,
JanusForConditionalGeneration,
JanusModel,
JanusVQVAE,
JanusVQVAEConfig,
is_torch_available,
is_vision_available,
)
from transformers.models.auto import get_values
from transformers.models.auto.modeling_auto import MODEL_FOR_BACKBONE_MAPPING_NAMES, MODEL_MAPPING_NAMES
from transformers.testing_utils import (
Expectations,
require_torch,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
class JanusVisionText2TextModelTester:
def __init__(
self,
parent,
image_token_index=0,
seq_length=25,
initializer_range=0.02,
text_config={
"model_type": "llama",
"seq_length": 7,
"is_training": True,
"use_input_mask": True,
"use_token_type_ids": False,
"use_labels": True,
"vocab_size": 99,
"hidden_size": 32,
"num_hidden_layers": 2,
"num_attention_heads": 4,
"intermediate_size": 37,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"attention_probs_dropout_prob": 0.1,
"max_position_embeddings": 512,
"type_vocab_size": 16,
"type_sequence_label_size": 2,
"initializer_range": 0.02,
"num_labels": 3,
"num_choices": 4,
"pad_token_id": 1,
},
is_training=True,
vision_config={
"use_labels": True,
"image_size": 20,
"patch_size": 5,
"num_image_tokens": 16,
"num_channels": 3,
"is_training": True,
"hidden_size": 32,
"projection_dim": 32,
"num_key_value_heads": 1,
"num_hidden_layers": 2,
"num_attention_heads": 4,
"mlp_ratio": 2,
"dropout": 0.1,
"attention_dropout": 0.1,
"initializer_range": 0.02,
"vision_feature_select_strategy": "default",
"vision_feature_layer": -1,
},
use_cache=False,
vq_num_embeds=12,
vq_embed_dim=12,
vq_channel_multiplier=[1, 1],
):
self.parent = parent
self.initializer_range = initializer_range
# `image_token_index` is set to 0 to pass "resize_embeddings" test, do not modify
self.image_token_index = image_token_index
self.text_config = text_config
self.vision_config = vision_config
self.seq_length = seq_length
self.pad_token_id = text_config["pad_token_id"]
self.num_hidden_layers = text_config["num_hidden_layers"]
self.vocab_size = text_config["vocab_size"]
self.hidden_size = text_config["hidden_size"]
self.num_attention_heads = text_config["num_attention_heads"]
self.is_training = is_training
self.batch_size = 3
self.num_channels = vision_config["num_channels"]
self.image_size = vision_config["image_size"]
self.num_image_tokens = vision_config["num_image_tokens"]
self.use_cache = use_cache
# vq model params
self.vq_num_embeds = vq_num_embeds
self.vq_embed_dim = vq_embed_dim
self.vq_channel_multiplier = vq_channel_multiplier
def get_vq_config(self):
return {
"embed_dim": self.vq_embed_dim,
"num_embeddings": self.vq_num_embeds,
"latent_channels": self.vq_embed_dim,
"in_channels": 3,
"base_channels": 32, # we have a GroupNorm of 32 groups, so can't do less
"channel_multiplier": self.vq_channel_multiplier,
"initializer_range": self.initializer_range,
"projection_dim": 10,
"image_token_embed_dim": 32, # Same as text model hidden size
}
def get_config(self):
return JanusConfig(
text_config=self.text_config,
vision_config=self.vision_config,
vq_config=self.get_vq_config(),
image_token_id=self.image_token_index,
)
def prepare_config_and_inputs(self):
config = self.get_config()
pixel_values = floats_tensor(
[
self.batch_size,
3,
self.image_size,
self.image_size,
]
)
return config, pixel_values
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
attention_mask = input_ids.ne(self.pad_token_id).to(torch_device)
# Set the first `num_image_tokens` (16) tokens to be image tokens and ensure that no other tokens are image tokens.
# Do not change this unless you modify the image size or patch size.
input_ids[input_ids == self.image_token_index] = self.pad_token_id
input_ids[:, : self.num_image_tokens] = self.image_token_index
inputs_dict = {
"pixel_values": pixel_values,
"input_ids": input_ids,
"attention_mask": attention_mask,
"labels": input_ids,
"generation_mode": "text", # Required to perform text generation instead of image generation.
}
return config, inputs_dict
@require_torch
class JanusVisionText2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (JanusModel, JanusForConditionalGeneration) if is_torch_available() else ()
all_generative_model_classes = (JanusForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_head_masking = False
_is_composite = True
def setUp(self):
self.model_tester = JanusVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=JanusConfig, has_text_modality=False)
def test_sdpa_can_dispatch_composite_models(self):
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
# Load the model with SDPA
model_sdpa = model_class.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
# Load model with eager attention
model_eager = model_class.from_pretrained(
tmpdirname,
attn_implementation="eager",
)
model_eager = model_eager.eval().to(torch_device)
# SigLIP has one shared class attribute for all models, so we assign it to both submodels here
vision_attn = language_attn = "sdpa" if model._supports_sdpa else "eager"
if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "language_model"):
self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn)
self.assertTrue(model_sdpa.language_model.config._attn_implementation == language_attn)
self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
self.assertTrue(model_eager.config._attn_implementation == "eager")
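# Check every attention submodule; the `(?!Pool)` negative lookahead in the pattern below excludes attention-pooling layers from the check.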
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if any(re.finditer(r"Attention(?!Pool)", class_name)):
self.assertTrue(submodule.config._attn_implementation == "eager")
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if any(re.finditer(r"Attention(?!Pool)", class_name)):
self.assertTrue(submodule.config._attn_implementation == "sdpa")
def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
if not self.model_tester.is_training:
self.skipTest(reason="ModelTester is not configured to run training tests")
"""
We skip some parameters when checking for gradient checkpointing:
- VQ model, as its training is not supported.
- A few other modules used for image generation.
"""
skip_patterns = ["vqmodel", "generation_embeddings", "generation_aligner", "generation_head"]
for model_class in self.all_model_classes:
with self.subTest(model_class.__name__):
if (
model_class.__name__
in [
*get_values(MODEL_MAPPING_NAMES),
*get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES),
]
or not model_class.supports_gradient_checkpointing
):
# TODO (ydshieh): use `skipTest` once pytest-dev/pytest-subtests/pull/169 is merged
# self.skipTest(reason=f"`supports_gradient_checkpointing` is False for {model_class.__name__}.")
continue
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.use_cache = False
config.return_dict = True
model = model_class(config)
model.to(torch_device)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)
model.train()
# unfreeze additional layers
for p in model.parameters():
p.requires_grad_(True)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
loss = model(**inputs).loss
loss.backward()
optimizer.step()
if self.test_all_params_have_gradient:
for k, v in model.named_parameters():
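# The `reduce` call below ORs together substring checks, i.e. it is True when any entry of `skip_patterns` appears in the parameter name `k`.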
if v.requires_grad and not reduce(lambda t, s: t | (s in k), skip_patterns, False):
self.assertTrue(v.grad is not None, f"{k} in {model_class.__name__} has no gradient!")
@unittest.skip("There are recompilations in Janus") # TODO (joao, raushan): fix me
@pytest.mark.torch_compile_test
def test_generate_compile_model_forward_fullgraph(self):
pass
class JanusVQModelTester:
def __init__(
self,
parent,
batch_size=5,
is_training=False,
initializer_range=0.02,
image_size=30,
num_embeds=12,
base_channels=32, # GroupNorm uses 32 groups, so this can't be lower
embed_dim=12,
channel_multiplier=[1, 2],
patch_size=2,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.is_training = is_training
self.initializer_range = initializer_range
self.image_size = image_size
self.base_channels = base_channels
self.num_embeds = num_embeds
self.embed_dim = embed_dim
self.channel_multiplier = channel_multiplier
self.num_patches = image_size // patch_size
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, 3, self.image_size, self.image_size])
config = self.get_config()
return config, pixel_values
def get_config(self):
return JanusVQVAEConfig(
embed_dim=self.embed_dim,
num_embeddings=self.num_embeds,
latent_channels=self.embed_dim,
in_channels=3,
base_channels=self.base_channels,
channel_multiplier=self.channel_multiplier,
initializer_range=self.initializer_range,
resolution=self.image_size,
num_patches=self.num_patches,
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
class JanusVQModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (JanusVQVAE,) if is_torch_available() else ()
test_head_masking = False
test_pruning = False
fx_compatible = False
has_attentions = False
test_resize_embeddings = False
def setUp(self):
self.model_tester = JanusVQModelTester(self)
self.config_tester = ConfigTester(
self,
config_class=JanusVQVAEConfig,
has_text_modality=False,
common_properties=["embed_dim", "num_embeddings"],
)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip("Janus VQ module cannot offload due to using `self.weight` directly")
def test_cpu_offload(self):
pass
@unittest.skip("Janus VQ module cannot offload due to using `self.weight` directly")
def test_disk_offload_bin(self):
pass
@unittest.skip("Janus VQ module cannot offload due to using `self.weight` directly")
def test_disk_offload_safetensors(self):
pass
@unittest.skip("Janus VQ module has no hidden states")
def test_hidden_states_output(self):
pass
@unittest.skip("Janus VQ module has no hidden states")
def test_model_outputs_equivalence(self):
pass
@unittest.skip("Janus VQ module has no get/set embeddings method")
def test_model_get_set_embeddings(self):
pass
@unittest.skip("Janus VQ module has no hidden states")
def test_retain_grad_hidden_states_attentions(self):
pass
class JanusIntegrationTest(unittest.TestCase):
def setUp(self):
self.model_id = "deepseek-community/Janus-Pro-1B"
@slow
def test_model_text_generation(self):
model = JanusForConditionalGeneration.from_pretrained(self.model_id, device_map="auto")
model.eval()
processor = AutoProcessor.from_pretrained(self.model_id)
image = Image.open(
requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw
)
prompt = "<image_placeholder>\nDescribe what do you see here and tell me about the history behind it?"
inputs = processor(images=image, text=prompt, generation_mode="text", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=20, generation_mode="text", do_sample=False)
EXPECTED_DECODED_TEXT = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\n\nDescribe what do you see here and tell me about the history behind it?\n\nThe image depicts the constellation of Leo, which is often referred to as the "Lion"' # fmt: skip
text = processor.decode(output[0], skip_special_tokens=True)
self.assertEqual(
text,
EXPECTED_DECODED_TEXT,
)
@slow
def test_model_text_generation_batched(self):
model = JanusForConditionalGeneration.from_pretrained(self.model_id, device_map="auto")
processor = AutoProcessor.from_pretrained(self.model_id)
image_1 = Image.open(
requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw
)
image_2 = Image.open(
requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw
)
prompts = [
"<image_placeholder>\nDescribe what do you see here and tell me about the history behind it?",
"What constellation is this image showing?<image_placeholder>\n",
]
inputs = processor(
images=[image_1, image_2], text=prompts, generation_mode="text", padding=True, return_tensors="pt"
).to(model.device, torch.float16)
EXPECTED_TEXT_COMPLETION = [
'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\n\nDescribe what do you see here and tell me about the history behind it?\n\nThe image depicts the constellation of Leo, which is often referred to as the "Lion"',
"You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nWhat constellation is this image showing?\n\nThe image shows a constellation that is shaped like a stylized figure with a long tail. This",
]
generated_ids = model.generate(**inputs, max_new_tokens=20, generation_mode="text", do_sample=False)
text = processor.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
@slow
def test_model_text_generation_with_multi_image(self):
model = JanusForConditionalGeneration.from_pretrained(self.model_id, device_map="auto")
processor = AutoProcessor.from_pretrained(self.model_id)
image_1 = Image.open(
requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw
)
image_2 = Image.open(
requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw
)
prompt = "What do these two images <image_placeholder> and <image_placeholder> have in common?"
inputs = processor(images=[image_1, image_2], text=prompt, generation_mode="text", return_tensors="pt").to(
model.device, torch.float16
)
EXPECTED_TEXT_COMPLETION = ['You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nWhat do these two images and have in common?\n\nThe two images you provided are of the same constellation. The first image shows the constellation of Leo, and the second image shows the constellation of Ursa Major. Both constellations are part of'] # fmt: skip
generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
text = processor.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
@slow
def test_model_generate_images(self):
model = JanusForConditionalGeneration.from_pretrained(self.model_id, device_map="auto")
processor = AutoProcessor.from_pretrained(self.model_id)
inputs = processor(
text=["A portrait of young girl. masterpiece, film grained, best quality."],
padding=True,
generation_mode="image",
return_tensors="pt",
).to(model.device)
self.assertTrue(inputs.input_ids.shape[1] == 17)
out = model.generate(
**inputs,
generation_mode="image",
do_sample=False,
)
# Generation should run for `num_image_tokens` steps, 576 in this case.
self.assertTrue(out.shape[1] == 576)
# fmt: off
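# `Expectations` stores per-device reference outputs keyed by accelerator type (and optionally version); `get_expectation()` picks the entry matching the current device.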
expected_tokens = Expectations(
{
("rocm", None): [
10367, 1380, 4841, 15155, 1224, 16361, 15834, 13722, 15258, 8321, 10496, 14532, 8770, 12353, 5481,
11484, 2585, 8587, 3201, 14292, 3356, 2037, 3077, 6107, 3758, 2572, 9376, 13219, 6007, 14292, 12696,
10666, 10046, 13483, 8282, 9101, 5208, 4260, 13886, 13335, 6135, 2316, 15423, 311, 5460, 12218,
14172, 8583, 14577, 3648
],
("rocm", (9, 5)): [
4484, 4015, 15750, 506, 3758, 11651, 8597, 5739, 4861, 971, 14985, 14834, 15438, 7548, 1820, 1465,
13529, 12761, 10503, 12761, 14303, 6155, 4015, 11766, 705, 15736, 14146, 10417, 1951, 7713, 14305,
15617, 6169, 2706, 8006, 14893, 3855, 10188, 15652, 6297, 1097, 12108, 15038, 311, 14998, 15165,
897, 4044, 1762, 4676
],
("cuda", None): [
4484, 4015, 15750, 506, 3758, 11651, 8597, 5739, 4861, 971, 14985, 14834, 15438, 7548, 1820, 1465,
13529, 12761, 10503, 12761, 14303, 6155, 4015, 11766, 705, 15736, 14146, 10417, 1951, 7713, 14305,
15617, 6169, 2706, 8006, 14893, 3855, 10188, 15652, 6297, 1097, 12108, 15038, 311, 14998, 15165,
897, 4044, 1762, 4676
],
}
)
expected_tokens = torch.tensor(expected_tokens.get_expectation()).to(model.device)
# fmt: on
# Compare the first 50 generated tokens.
self.assertTrue(torch.allclose(expected_tokens, out[0][:50]))
# Decode generated tokens to pixel values and postprocess them.
decoded_pixel_values = model.decode_image_tokens(out)
images = processor.postprocess(list(decoded_pixel_values.float()), return_tensors="np")
self.assertTrue(images["pixel_values"].shape == (1, 384, 384, 3))
self.assertTrue(isinstance(images["pixel_values"], np.ndarray))


@@ -0,0 +1,465 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Janus model."""
import tempfile
import unittest
import numpy as np
from transformers import AutoProcessor, AutoTokenizer, JanusProcessor
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
class JanusProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = JanusProcessor
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
special_image_tokens = {
"image_token": "<image_placeholder>",
"boi_token": "<begin_of_image>",
"eoi_token": "<end_of_image>",
}
processor = self.processor_class.from_pretrained(
"deepseek-community/Janus-Pro-1B",
extra_special_tokens=special_image_tokens,
**self.prepare_processor_dict(),
)
# Disable the default system prompt, which is otherwise applied based on the input modality,
# so the tests behave consistently regardless of the inputs.
processor.use_default_system_prompt = False
processor.save_pretrained(self.tmpdirname)
def get_tokenizer(self, **kwargs):
return AutoTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def get_processor(self):
return AutoProcessor.from_pretrained(self.tmpdirname)
def test_chat_template_single(self):
"""
Tests that the chat template matches the original implementation when applied to a single message.
"""
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
# Single image message
messages = [
[
{
"role": "user",
"content": [
{"type": "text", "text": "What is shown in this image?"},
{"type": "image"},
],
},
]
]
correct_prompt = ["<|User|>: What is shown in this image?\n<image_placeholder>\n\n<|Assistant|>:"]
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
self.assertEqual(formatted_prompt, correct_prompt)
# Single image message with capitalization
messages = [
[
{
"role": "User",
"content": [
{"type": "text", "text": "What is shown in this image?"},
{"type": "image"},
],
},
]
]
correct_prompt = ["<|User|>: What is shown in this image?\n<image_placeholder>\n\n<|Assistant|>:"]
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
self.assertEqual(formatted_prompt, correct_prompt)
# Single image message with uppercase
messages = [
[
{
"role": "USER",
"content": [
{"type": "text", "text": "What is shown in this image?"},
{"type": "image"},
],
},
]
]
correct_prompt = ["<|User|>: What is shown in this image?\n<image_placeholder>\n\n<|Assistant|>:"]
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
self.assertEqual(formatted_prompt, correct_prompt)
"""
Warning: normally, the other models have a test comparing chat template+tokenization as two separate steps
versus as a single step (i.e. processor.apply_chat_template(..., tokenize=True)). However, our processor has
some extra steps other than simply applying prompt to tokenizer. These include prepending the default system
prompts and, following the implementation from the Janus codebase, expanding the image token.
"""
# Checking the output dict keys
out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True)
self.assertListEqual(list(out_dict.keys()), ["input_ids", "attention_mask"])
# Now add a real image and check that pixel values are returned as well
messages[0][0]["content"][1].update(
{
"type": "image",
"url": url_to_local_path(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
),
}
)
out_dict = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True)
self.assertTrue(self.images_input_name in out_dict)
# should always have input_ids and attention_mask
self.assertEqual(len(out_dict["input_ids"]), 1)
self.assertEqual(len(out_dict["attention_mask"]), 1)
self.assertEqual(len(out_dict[self.images_input_name]), 1)
# Pass the generation prompt explicitly via an empty assistant turn instead of `add_generation_prompt=True`
messages = [
[
{
"role": "user",
"content": [
{"type": "text", "text": "What is shown in this image?"},
{"type": "image"},
],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": ""},
],
},
]
]
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=False)
self.assertEqual(formatted_prompt, correct_prompt)
# Single prompt with multiple images
messages = [
[
{
"role": "user",
"content": [
{"type": "text", "text": "Compare this image"},
{"type": "image"},
{"type": "text", "text": "with this image"},
{"type": "image"},
],
},
]
]
correct_prompt = [
"<|User|>: Compare this image\n<image_placeholder>\nwith this image\n<image_placeholder>\n\n<|Assistant|>:"
]
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
self.assertEqual(formatted_prompt, correct_prompt)
# Multiple turns and multiple images
messages = [
[
{
"role": "user",
"content": [
{"type": "text", "text": "Compare this image"},
{"type": "image"},
{"type": "text", "text": "with this image"},
{"type": "image"},
],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "The first image is an equation, the second is a pie chart."},
],
},
{
"role": "user",
"content": [
{"type": "image"},
{
"type": "text",
"text": "What about this third image? To which of the previous to is it more similar?",
},
],
},
]
]
correct_prompt = [
"<|User|>: Compare this image\n<image_placeholder>\nwith this image\n<image_placeholder>\n\n<|Assistant|>: The first image is an equation, the second is a pie chart.<end▁of▁sentence><|User|>: <image_placeholder>\nWhat about this third image? To which of the previous to is it more similar?\n\n<|Assistant|>:"
]
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
self.assertEqual(formatted_prompt, correct_prompt)
def test_chat_template_batched(self):
"""
Tests that the chat template matches the original implementation when applied to a batch of messages.
"""
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
# Test 1: Simple single image per message batch
batched_messages = [
[
{
"role": "user",
"content": [
{"type": "text", "text": "What is shown in this image?"},
{"type": "image"},
],
},
],
[
{
"role": "user",
"content": [
{"type": "text", "text": "What is shown in this image?"},
{"type": "image"},
],
},
],
]
correct_prompts = [
"<|User|>: What is shown in this image?\n<image_placeholder>\n\n<|Assistant|>:",
"<|User|>: What is shown in this image?\n<image_placeholder>\n\n<|Assistant|>:",
]
formatted_prompts = processor.apply_chat_template(batched_messages, add_generation_prompt=True)
self.assertEqual(formatted_prompts, correct_prompts)
# As in the single-message case, there is no test comparing chat template + tokenization as two separate steps versus a single step
# Checking the output dict keys
out_dict = processor.apply_chat_template(
batched_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
padding=True,
)
self.assertListEqual(list(out_dict.keys()), ["input_ids", "attention_mask"])
# Verify image inputs are included in the output dict
batched_messages[0][0]["content"][1].update(
{
"type": "image",
"url": url_to_local_path(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
),
}
)
batched_messages[1][0]["content"][1].update(
{"type": "image", "url": url_to_local_path("http://images.cocodataset.org/val2017/000000039769.jpg")}
)
out_dict = processor.apply_chat_template(
batched_messages, add_generation_prompt=True, tokenize=True, return_dict=True, padding=True
)
self.assertTrue(self.images_input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), 2) # Batch size for text
self.assertEqual(len(out_dict["attention_mask"]), 2) # Batch size for attention mask
self.assertEqual(len(out_dict[self.images_input_name]), 2) # Batch size for images
# Test 2: Two images per message batch with different prompts
batched_messages = [
[
{
"role": "user",
"content": [
{"type": "text", "text": "Compare this image"},
{"type": "image"},
{"type": "text", "text": "with this image"},
{"type": "image"},
],
},
],
[
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "Describe how the previous image compares to the following"},
{"type": "image"},
],
},
],
]
correct_prompts = [
"<|User|>: Compare this image\n<image_placeholder>\nwith this image\n<image_placeholder>\n\n<|Assistant|>:",
"<|User|>: <image_placeholder>\nDescribe how the previous image compares to the following\n<image_placeholder>\n\n<|Assistant|>:",
]
formatted_prompts = processor.apply_chat_template(batched_messages, add_generation_prompt=True)
self.assertEqual(formatted_prompts, correct_prompts)
# Test 3: Multi-turn conversations with multiple images
batched_messages = [
[
{
"role": "user",
"content": [
{"type": "text", "text": "Compare this image"},
{"type": "image"},
{"type": "text", "text": "with this image"},
{"type": "image"},
],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "The first image is an equation, the second is a pie chart."},
],
},
{
"role": "user",
"content": [
{"type": "image"},
{
"type": "text",
"text": "What about this third image? To which of the previous to is it more similar?",
},
],
},
],
[
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "Describe how the previous image compares to the following"},
{"type": "image"},
],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "The first image is a formula, the second is a plot."},
],
},
{
"role": "user",
"content": [
{"type": "text", "text": "Which of them is closer to the following?"},
{"type": "image"},
],
},
],
]
correct_prompts = [
"<|User|>: Compare this image\n<image_placeholder>\nwith this image\n<image_placeholder>\n\n<|Assistant|>: The first image is an equation, the second is a pie chart.<end▁of▁sentence><|User|>: <image_placeholder>\nWhat about this third image? To which of the previous to is it more similar?\n\n<|Assistant|>:",
"<|User|>: <image_placeholder>\nDescribe how the previous image compares to the following\n<image_placeholder>\n\n<|Assistant|>: The first image is a formula, the second is a plot.<end▁of▁sentence><|User|>: Which of them is closer to the following?\n<image_placeholder>\n\n<|Assistant|>:",
]
formatted_prompts = processor.apply_chat_template(batched_messages, add_generation_prompt=True)
self.assertEqual(formatted_prompts, correct_prompts)
def test_chat_template_accepts_processing_kwargs(self):
"""Tests that the chat template correctly handles additional processing arguments."""
# Get processor and skip if it doesn't have a chat template
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
# Create a simple text message for testing
messages = [
[
{
"role": "user",
"content": [
{"type": "text", "text": "What is shown in this image?"},
],
},
]
]
# Test 1: Padding to max_length
# PS: we override the parent's max_length of 50 with 80 because the output is already 51 tokens
formatted_prompt_tokenized = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
padding="max_length",
max_length=80,
)
self.assertEqual(len(formatted_prompt_tokenized[0]), 80)
# Test 2: Truncation
# Verify that the output is truncated to exactly 5 tokens
formatted_prompt_tokenized = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
truncation=True,
max_length=5,
)
self.assertEqual(len(formatted_prompt_tokenized[0]), 5)
# Test 3: Image processing kwargs
# Add an image and test image processing parameters
messages[0][0]["content"].append(
{
"type": "image",
"url": url_to_local_path(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
),
}
)
# Process with image rescaling and verify the pixel values are negative
out_dict = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
do_rescale=True,
rescale_factor=-1,
return_tensors="np",
)
self.assertLessEqual(out_dict[self.images_input_name][0][0].mean(), 0)
def test_processor_postprocess(self):
processor_components = self.prepare_components()
processor = self.processor_class(**processor_components)
input_str = "lower newer"
orig_image_input = self.prepare_image_inputs()
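# Convert the PIL image to a channels-first (C, H, W) numpy array before passing it to the processor.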
orig_image = np.array(orig_image_input).transpose(2, 0, 1)
inputs = processor(text=input_str, images=orig_image, do_resize=False, do_pad=False, return_tensors="np")
normalized_image_input = inputs.pixel_values
unnormalized_images = processor.postprocess(normalized_image_input, return_tensors="np")["pixel_values"]
# For an image with pixel values from 0 to 255, the round-trip diff can be up to 1 due to numerical precision errors when scaling and unscaling
self.assertTrue(np.abs(orig_image - unnormalized_images).max() <= 1)