# R-4B/image_processing_r_fast.py

# coding=utf-8
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Union

import torch

from transformers.image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution
from transformers.image_processing_utils_fast import (
    BaseImageProcessorFast,
    DefaultFastImageProcessorKwargs,
    divide_to_patches,
    group_images_by_shape,
    reorder_images,
)
from transformers.image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    SizeDict,
    get_image_size,
    make_flat_list_of_images,
)
from transformers.processing_utils import Unpack
from transformers.utils import TensorType, auto_docstring, is_torchvision_v2_available


if is_torchvision_v2_available():
    from torchvision.transforms.v2 import functional as F
else:
    from torchvision.transforms import functional as F


class RFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    """
    image_grid_pinpoints (`list[list[int]]`, *optional*):
        A list of possible resolutions used when splitting a single image into patches. For every patched image
        the best-fitting entry is picked with `select_best_resolution` based on the image's own size.
    do_pad (`bool`, *optional*):
        Whether to pad the patch dimension of every image in the batch up to the largest number of patches found
        in the batch. Zero-filled patches are appended at the end of the patch dimension.
    """

    image_grid_pinpoints: Optional[list[list[int]]]
    do_pad: Optional[bool]


@auto_docstring
class RImageProcessorFast(BaseImageProcessorFast):
    # Torchvision-based ("fast") image processor. A sample made of a single image is expanded into a resized
    # copy of the full image plus a grid of patches cut from a resized/padded version of it; samples made of
    # several images are only padded to a square. The class attributes below are the default settings.
    resample = PILImageResampling.BICUBIC
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    size = {"height": 384, "width": 384}
    default_to_square = False
    crop_size = None
    do_resize = True
    do_center_crop = None
    do_rescale = True
    do_normalize = True
    do_convert_rgb = True
    do_pad = True
    # Candidate resolutions handed to `select_best_resolution`. This has to be a plain list of pairs:
    # `_get_image_patches` raises a `TypeError` for anything that is not a `list`, so a stray trailing comma
    # here (which would wrap the list in a 1-tuple) makes the default unusable.
    image_grid_pinpoints = [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
    valid_kwargs = RFastImageProcessorKwargs
    # NOTE(review): `_preprocess` returns the keys "pixel_values", "image_sizes" and "batch_num_images";
    # none of them is listed here. Left unchanged because callers may rely on the current value - confirm.
    model_input_names = ["pixel_values_videos"]

    def __init__(self, **kwargs: Unpack[RFastImageProcessorKwargs]):
        super().__init__(**kwargs)

    @auto_docstring
    def preprocess(
        self, images: ImageInput, **kwargs: Unpack[RFastImageProcessorKwargs]
    ) -> BatchFeature:
        # Record how many images belong to each sample before the base class flattens the input;
        # `_preprocess` uses these counts to decide which images get split into patches.
        is_sequence = isinstance(images, (tuple, list))
        # The emptiness check keeps an empty list/tuple from raising IndexError on `images[0]`;
        # the empty input is then passed on to the base class unchanged.
        if is_sequence and len(images) > 0 and isinstance(images[0], (tuple, list)):
            # if the first element is a list, we assume that all elements are lists
            batch_num_images = [len(x) for x in images]
        elif is_sequence:
            # treat this as a single-image case for backward compatibility
            batch_num_images = [1] * len(images)
        else:
            batch_num_images = [1]
        kwargs["batch_num_images"] = batch_num_images
        return super().preprocess(images, **kwargs)

    def _prepare_images_structure(
        self,
        images: ImageInput,
    ) -> ImageInput:
        """
        Flatten the (possibly nested) input into a flat list of images via `make_flat_list_of_images`.
        """
        return make_flat_list_of_images(images)

    def _resize_for_patching(
        self,
        image: "torch.Tensor",
        target_resolution: tuple,
        interpolation: "F.InterpolationMode",
        input_data_format: ChannelDimension,
    ) -> "torch.Tensor":
        """
        Resize an image to the size returned by `get_patch_output_size` for the given target resolution.

        Args:
            image ("torch.Tensor"):
                The image to resize.
            target_resolution (`tuple`):
                The resolution selected for this image.
            interpolation (`"InterpolationMode"`):
                Resampling filter to use when resizing.
            input_data_format (`ChannelDimension`):
                Channel layout of `image`, forwarded to `get_patch_output_size`.

        Returns:
            "torch.Tensor": The resized image.
        """
        new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

        # Resize the image
        resized_image = F.resize(image, (new_height, new_width), interpolation=interpolation)

        return resized_image

    def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple):
        """
        Compute the per-side padding needed to grow `original_resolution` to `target_resolution`.

        Both arguments are `(height, width)` pairs. The missing width/height is split evenly between the two
        sides; when the difference is odd, the extra pixel goes to the second (third/fourth list entry) side.

        Returns:
            `list[int]`: `[x_first, y_first, x_second, y_second]`, in the order this class passes to `F.pad`.
        """
        original_height, original_width = original_resolution
        target_height, target_width = target_resolution
        paste_x, r_x = divmod(target_width - original_width, 2)
        paste_y, r_y = divmod(target_height - original_height, 2)
        return [paste_x, paste_y, paste_x + r_x, paste_y + r_y]

    def _pad_for_patching(
        self, image: "torch.Tensor", target_resolution: tuple, input_data_format: ChannelDimension
    ) -> "torch.Tensor":
        """
        Pad an image to a target resolution while maintaining aspect ratio.

        The padding amounts are derived from the difference between `target_resolution` and the size
        `get_patch_output_size` reports for the image; `F.pad` is called without a fill value.
        """
        new_resolution = get_patch_output_size(image, target_resolution, input_data_format)
        padding = self._get_padding_size(new_resolution, target_resolution)

        padded_image = F.pad(image, padding=padding)

        return padded_image

    def _get_image_patches(
        self,
        image: "torch.Tensor",
        grid_pinpoints,
        size: tuple,
        patch_size: int,
        interpolation: "F.InterpolationMode",
    ) -> list["torch.Tensor"]:
        """
        Process an image with variable resolutions by dividing it into patches.

        Args:
            image ("torch.Tensor"):
                The input image to be processed.
            grid_pinpoints (`list`):
                A list of possible resolutions; the best one for this image is chosen with
                `select_best_resolution`.
            size (`tuple`):
                Size to resize the original image to.
            patch_size (`int`):
                Size of the patches to divide the image into.
            interpolation (`"InterpolationMode"`):
                Resampling filter to use if resizing the image.

        Returns:
            list["torch.Tensor"]: The original image resized to `size`, followed by the patches cut from the
            resized and padded image.

        Raises:
            TypeError: If `grid_pinpoints` is not a `list`.
        """
        if not isinstance(grid_pinpoints, list):
            raise TypeError("grid_pinpoints must be a list of possible resolutions.")

        possible_resolutions = grid_pinpoints

        image_size = get_image_size(image, channel_dim=ChannelDimension.FIRST)
        best_resolution = select_best_resolution(image_size, possible_resolutions)
        resized_image = self._resize_for_patching(
            image, best_resolution, interpolation=interpolation, input_data_format=ChannelDimension.FIRST
        )
        padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=ChannelDimension.FIRST)
        patches = divide_to_patches(padded_image, patch_size=patch_size)
        resized_original_image = F.resize(image, size=size, interpolation=interpolation)

        # The resized full image always comes first, ahead of the grid patches.
        image_patches = [resized_original_image] + patches

        return image_patches

    def _pad_for_batching(
        self,
        pixel_values: list["torch.Tensor"],
    ) -> list["torch.Tensor"]:
        """
        Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches.

        Args:
            pixel_values (`list[torch.Tensor]`):
                An array of pixel values of each images of shape (`batch_size`, `num_patches`, `image_in_3D`)

        Returns:
            list[`torch.Tensor`]: The padded images.
        """
        max_patch = max(len(x) for x in pixel_values)
        # The pad list is read from the last dimension backwards, two entries per dimension, so only the
        # final pair (applied to the first dimension of a 4-D entry) is non-zero: missing patches are
        # appended at the end.
        pixel_values = [
            torch.nn.functional.pad(image, pad=[0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[0]])
            for image in pixel_values
        ]

        return pixel_values

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_resize: bool,
        size: SizeDict,
        image_grid_pinpoints: list[list[int]],
        interpolation: Optional["F.InterpolationMode"],
        do_center_crop: bool,
        crop_size: SizeDict,
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Optional[Union[float, list[float]]],
        image_std: Optional[Union[float, list[float]]],
        do_pad: bool,
        batch_num_images: list[int],
        return_tensors: Optional[Union[str, TensorType]],
    ) -> BatchFeature:
        """
        Turn a flat list of images into patch stacks and apply resize / crop / rescale / normalize to them.

        Images that are alone in their sample (count of 1 in `batch_num_images`) are split with
        `_get_image_patches`; images from multi-image samples are only padded to a square. The resulting
        patches of each image are grouped by shape, processed in batches and put back in order.

        Returns:
            `BatchFeature`: with the keys `pixel_values` (per-image patch stacks, padded along the patch
            dimension when `do_pad` is set and stacked when `return_tensors` is set), `image_sizes` (size of
            each input image as returned by `get_image_size`) and `batch_num_images`.
        """
        processed_images = []
        image_sizes = []

        # only single image patching is supported
        # (one flag per flattened image: True only when the image is the sole image of its sample)
        need_patching = [n == 1 for n in batch_num_images for _ in range(n)]

        # Determine the size tuple
        if size and size.height and size.width:
            size_tuple = (size.height, size.width)
        else:
            size_tuple = (size.shortest_edge, size.shortest_edge)

        # Determine the patch size
        if crop_size and crop_size.height:
            patch_size = crop_size.height
        elif size and size.height:
            patch_size = size.height
        else:
            patch_size = size.shortest_edge

        for i, image in enumerate(images):
            if need_patching[i]:
                image_patches = self._get_image_patches(
                    image,
                    image_grid_pinpoints,
                    size=size_tuple,
                    patch_size=patch_size,
                    interpolation=interpolation,
                )
            else:
                # The fill color is the mean scaled by 255 and truncated to int.
                # NOTE(review): this reads `self.image_mean`, not the per-call `image_mean` argument.
                padded_image = self.pad_to_square(
                    images=image, background_color=tuple(int(x * 255) for x in self.image_mean)
                )
                image_patches = [padded_image]

            # Group images by size for batched processing
            processed_image_patches_grouped = {}
            grouped_image_patches, grouped_image_patches_index = group_images_by_shape(image_patches)
            for shape, stacked_image_patches in grouped_image_patches.items():
                if do_resize:
                    stacked_image_patches = self.resize(
                        image=stacked_image_patches,
                        size=size,
                        interpolation=interpolation,
                    )
                if do_center_crop:
                    stacked_image_patches = self.center_crop(stacked_image_patches, crop_size)
                # Fused rescale and normalize
                stacked_image_patches = self.rescale_and_normalize(
                    stacked_image_patches, do_rescale, rescale_factor, do_normalize, image_mean, image_std
                )
                processed_image_patches_grouped[shape] = stacked_image_patches
            processed_image_patches = reorder_images(processed_image_patches_grouped, grouped_image_patches_index)
            processed_image_patches = (
                torch.stack(processed_image_patches, dim=0) if return_tensors else processed_image_patches
            )
            processed_images.append(processed_image_patches)
            image_sizes.append(get_image_size(image, ChannelDimension.FIRST))

        if do_pad:
            processed_images = self._pad_for_batching(processed_images)
        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
        return BatchFeature(
            data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
            tensor_type=return_tensors,
        )

    # Copied from transformers.models.llava.image_processing_llava_fast.LlavaImageProcessorFast.pad_to_square
    def pad_to_square(
        self,
        images: "torch.Tensor",
        background_color: Union[int, tuple[int, int, int]] = 0,
    ) -> "torch.Tensor":
        """
        Pads an image to a square based on the longest edge.

        Args:
            images (`torch.Tensor`):
                The images to pad.
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers representing for multi-channel images. If passed as integer
                in multi-channel mode, it will default to `0` in subsequent channels.
        Returns:
            `torch.Tensor`: The padded images.

        Raises:
            ValueError: If `background_color` is a sequence whose length differs from the number of channels.
        """
        height, width = get_image_size(images, ChannelDimension.FIRST)

        # Already square: the input is returned as is.
        if height == width:
            return images

        num_channels = images.shape[1] if len(images.shape) == 4 else images.shape[0]
        if isinstance(background_color, int):
            background_color = [background_color] + [0] * (num_channels - 1)
        elif len(background_color) != num_channels:
            raise ValueError(
                f"background_color must have no more than {num_channels} elements to match the number of channels"
            )

        # Split the missing pixels between both sides; the second side takes the extra pixel when odd.
        max_dim = max(height, width)
        paste_x_left = (max_dim - width) // 2
        paste_y_left = (max_dim - height) // 2
        paste_x_right = max_dim - width - paste_x_left
        paste_y_right = max_dim - height - paste_y_left
        padded_images = F.pad(
            images, padding=[paste_x_left, paste_y_left, paste_x_right, paste_y_right], fill=background_color
        )

        return padded_images


__all__ = ["RImageProcessorFast"]
初始化项目，由ModelHub XC社区提供模型 Model: AI-ModelScope/R-4B Source: Original Platform 2026-05-21 17:44:12 +08:00			`# coding=utf-8`
			`# Copyright 2024 HuggingFace Inc. team. All rights reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`from typing import Optional, Union`

			`import torch`

			`from transformers.image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution`
			`from transformers.image_processing_utils_fast import (`
			`BaseImageProcessorFast,`
			`DefaultFastImageProcessorKwargs,`
			`divide_to_patches,`
			`group_images_by_shape,`
			`reorder_images,`
			`)`
			`from transformers.image_utils import (`
			`OPENAI_CLIP_MEAN,`
			`OPENAI_CLIP_STD,`
			`ChannelDimension,`
			`ImageInput,`
			`PILImageResampling,`
			`SizeDict,`
			`get_image_size,`
			`make_flat_list_of_images,`
			`)`
			`from transformers.processing_utils import Unpack`
			`from transformers.utils import TensorType, auto_docstring, is_torchvision_v2_available`


			`if is_torchvision_v2_available():`
			`from torchvision.transforms.v2 import functional as F`
			`else:`
			`from torchvision.transforms import functional as F`


			`class RFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):`
			`image_grid_pinpoints: Optional[list[list[int]]]`
			`do_pad: Optional[bool]`


			`@auto_docstring`
			`class RImageProcessorFast(BaseImageProcessorFast):`
			`resample = PILImageResampling.BICUBIC`
			`image_mean = OPENAI_CLIP_MEAN`
			`image_std = OPENAI_CLIP_STD`
			`size = {"height": 384, "width": 384}`
			`default_to_square = False`
			`crop_size = None`
			`do_resize = True`
			`do_center_crop = None`
			`do_rescale = True`
			`do_normalize = True`
			`do_convert_rgb = True`
			`do_pad = True`
			`image_grid_pinpoints = [[384,768],[768,384],[768,768],[1152,384],[384,1152]],`
			`valid_kwargs = RFastImageProcessorKwargs`
			`model_input_names = ["pixel_values_videos"]`

			`def __init__(self, **kwargs: Unpack[RFastImageProcessorKwargs]):`
			`super().__init__(**kwargs)`

			`@auto_docstring`
			`def preprocess(`
			`self, images: ImageInput, **kwargs: Unpack[RFastImageProcessorKwargs]`
			`) -> BatchFeature:`
			`if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)):`
			`# if the first element is a list, we assume that all elements are lists`
			`batch_num_images = [len(x) for x in images]`
			`elif isinstance(images, (tuple, list)):`
			`# treat this as a single-image case for backward compatibility`
			`batch_num_images = [1] * len(images)`
			`else:`
			`batch_num_images = [1]`
			`kwargs["batch_num_images"] = batch_num_images`
			`return super().preprocess(images, **kwargs)`

			`def _prepare_images_structure(`
			`self,`
			`images: ImageInput,`
			`) -> ImageInput:`
			`return make_flat_list_of_images(images)`

			`def _resize_for_patching(`
			`self,`
			`image: "torch.Tensor",`
			`target_resolution: tuple,`
			`interpolation: "F.InterpolationMode",`
			`input_data_format: ChannelDimension,`
			`) -> "torch.Tensor":`

			`new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)`

			`# Resize the image`
			`resized_image = F.resize(image, (new_height, new_width), interpolation=interpolation)`

			`return resized_image`

			`def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple):`
			`original_height, original_width = original_resolution`
			`target_height, target_width = target_resolution`
			`paste_x, r_x = divmod(target_width - original_width, 2)`
			`paste_y, r_y = divmod(target_height - original_height, 2)`
			`return [paste_x, paste_y, paste_x + r_x, paste_y + r_y]`

			`def _pad_for_patching(`
			`self, image: "torch.Tensor", target_resolution: tuple, input_data_format: ChannelDimension`
			`) -> "torch.Tensor":`
			`"""`
			`Pad an image to a target resolution while maintaining aspect ratio.`
			`"""`
			`new_resolution = get_patch_output_size(image, target_resolution, input_data_format)`
			`padding = self._get_padding_size(new_resolution, target_resolution)`

			`padded_image = F.pad(image, padding=padding)`

			`return padded_image`

			`def _get_image_patches(`
			`self,`
			`image: "torch.Tensor",`
			`grid_pinpoints,`
			`size: tuple,`
			`patch_size: int,`
			`interpolation: "F.InterpolationMode",`
			`) -> list["torch.Tensor"]:`
			`"""`
			`Process an image with variable resolutions by dividing it into patches.`

			`Args:`
			`image ("torch.Tensor"):`
			`The input image to be processed.`
			`grid_pinpoints (List):`
			`A string representation of a list of possible resolutions.`
			size (`tuple`):
			`Size to resize the original image to.`
			patch_size (`int`):
			`Size of the patches to divide the image into.`
			interpolation (`"InterpolationMode"`):
			`Resampling filter to use if resizing the image.`

			`Returns:`
			`list["torch.Tensor"]: A list of NumPy arrays containing the processed image patches.`
			`"""`
			`if not isinstance(grid_pinpoints, list):`
			`raise TypeError("grid_pinpoints must be a list of possible resolutions.")`

			`possible_resolutions = grid_pinpoints`

			`image_size = get_image_size(image, channel_dim=ChannelDimension.FIRST)`
			`best_resolution = select_best_resolution(image_size, possible_resolutions)`
			`resized_image = self._resize_for_patching(`
			`image, best_resolution, interpolation=interpolation, input_data_format=ChannelDimension.FIRST`
			`)`
			`padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=ChannelDimension.FIRST)`
			`patches = divide_to_patches(padded_image, patch_size=patch_size)`
			`resized_original_image = F.resize(image, size=size, interpolation=interpolation)`

			`image_patches = [resized_original_image] + patches`

			`return image_patches`

			`def _pad_for_batching(`
			`self,`
			`pixel_values: list["torch.Tensor"],`
			`) -> list["torch.Tensor"]:`
			`"""`
			Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches.

			`Args:`
			pixel_values (`list[torch.Tensor]`):
			An array of pixel values of each images of shape (`batch_size`, `num_patches`, `image_in_3D`)

			`Returns:`
			list[`torch.Tensor`]: The padded images.
			`"""`
			`max_patch = max(len(x) for x in pixel_values)`
			`pixel_values = [`
			`torch.nn.functional.pad(image, pad=[0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[0]])`
			`for image in pixel_values`
			`]`

			`return pixel_values`

			`def _preprocess(`
			`self,`
			`images: list["torch.Tensor"],`
			`do_resize: bool,`
			`size: SizeDict,`
			`image_grid_pinpoints: list[list[int]],`
			`interpolation: Optional["F.InterpolationMode"],`
			`do_center_crop: bool,`
			`crop_size: SizeDict,`
			`do_rescale: bool,`
			`rescale_factor: float,`
			`do_normalize: bool,`
			`image_mean: Optional[Union[float, list[float]]],`
			`image_std: Optional[Union[float, list[float]]],`
			`do_pad: bool,`
			`batch_num_images: list[int],`
			`return_tensors: Optional[Union[str, TensorType]],`
			`) -> BatchFeature:`
			`processed_images = []`
			`image_sizes = []`

			`# only single image patching is supported`
			`need_patching = [n == 1 for n in batch_num_images for _ in range(n)]`

			`# Determine the size tuple`
			`if size and size.height and size.width:`
			`size_tuple = (size.height, size.width)`
			`else:`
			`size_tuple = (size.shortest_edge, size.shortest_edge)`

			`# Determine the patch size`
			`if crop_size and crop_size.height:`
			`patch_size = crop_size.height`
			`elif size and size.height:`
			`patch_size = size.height`
			`else:`
			`patch_size = size.shortest_edge`

			`for i, image in enumerate(images):`
			`if need_patching[i]:`
			`image_patches = self._get_image_patches(`
			`image,`
			`image_grid_pinpoints,`
			`size=size_tuple,`
			`patch_size=patch_size,`
			`interpolation=interpolation,`
			`)`
			`else:`
			`padded_image = self.pad_to_square(`
			`images=image, background_color=tuple(int(x * 255) for x in self.image_mean)`
			`)`
			`image_patches = [padded_image]`

			`# Group images by size for batched processing`
			`processed_image_patches_grouped = {}`
			`grouped_image_patches, grouped_image_patches_index = group_images_by_shape(image_patches)`
			`for shape, stacked_image_patches in grouped_image_patches.items():`
			`if do_resize:`
			`stacked_image_patches = self.resize(`
			`image=stacked_image_patches,`
			`size=size,`
			`interpolation=interpolation,`
			`)`
			`if do_center_crop:`
			`stacked_image_patches = self.center_crop(stacked_image_patches, crop_size)`
			`# Fused rescale and normalize`
			`stacked_image_patches = self.rescale_and_normalize(`
			`stacked_image_patches, do_rescale, rescale_factor, do_normalize, image_mean, image_std`
			`)`
			`processed_image_patches_grouped[shape] = stacked_image_patches`
			`processed_image_patches = reorder_images(processed_image_patches_grouped, grouped_image_patches_index)`
			`processed_image_patches = (`
			`torch.stack(processed_image_patches, dim=0) if return_tensors else processed_image_patches`
			`)`
			`processed_images.append(processed_image_patches)`
			`image_sizes.append(get_image_size(image, ChannelDimension.FIRST))`

			`if do_pad:`
			`processed_images = self._pad_for_batching(processed_images)`
			`processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images`
			`return BatchFeature(`
			`data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},`
			`tensor_type=return_tensors,`
			`)`

			`# Copied from transformers.models.llava.image_processing_llava_fast.LlavaImageProcessorFast.pad_to_square`
			`def pad_to_square(`
			`self,`
			`images: "torch.Tensor",`
			`background_color: Union[int, tuple[int, int, int]] = 0,`
			`) -> "torch.Tensor":`
			`"""`
			`Pads an image to a square based on the longest edge.`

			`Args:`
			images (`np.ndarray`):
			`The images to pad.`
			background_color (`int` or `tuple[int, int, int]`, optional, defaults to 0):
			`The color to use for the padding. Can be an integer for single channel or a`
			`tuple of integers representing for multi-channel images. If passed as integer`
			in mutli-channel mode, it will default to `0` in subsequent channels.
			`Returns:`
			`torch.Tensor`: The padded images.
			`"""`
			`height, width = get_image_size(images, ChannelDimension.FIRST)`

			`if height == width:`
			`return images`

			`num_channels = images.shape[1] if len(images.shape) == 4 else images.shape[0]`
			`if isinstance(background_color, int):`
			`background_color = [background_color] + [0] * (num_channels - 1)`
			`elif len(background_color) != num_channels:`
			`raise ValueError(`
			`f"background_color must have no more than {num_channels} elements to match the number of channels"`
			`)`

			`max_dim = max(height, width)`
			`paste_x_left = (max_dim - width) // 2`
			`paste_y_left = (max_dim - height) // 2`
			`paste_x_right = max_dim - width - paste_x_left`
			`paste_y_right = max_dim - height - paste_y_left`
			`padded_images = F.pad(`
			`images, padding=[paste_x_left, paste_y_left, paste_x_right, paste_y_right], fill=background_color`
			`)`

			`return padded_images`


			`__all__ = ["RImageProcessorFast"]`