初始化项目,由ModelHub XC社区提供模型
Model: AI-ModelScope/R-4B Source: Original Platform
This commit is contained in:
499
image_processing_r.py
Normal file
499
image_processing_r.py
Normal file
@@ -0,0 +1,499 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.image_processing_utils import (
|
||||
BaseImageProcessor,
|
||||
BatchFeature,
|
||||
get_patch_output_size,
|
||||
get_size_dict,
|
||||
select_best_resolution,
|
||||
)
|
||||
from transformers.image_transforms import (
|
||||
PaddingMode,
|
||||
convert_to_rgb,
|
||||
pad,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from transformers.image_utils import (
|
||||
OPENAI_CLIP_MEAN,
|
||||
OPENAI_CLIP_STD,
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
is_scaled_image,
|
||||
make_flat_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from transformers.utils import TensorType, is_vision_available, logging
|
||||
|
||||
|
||||
# Module-level logger, obtained through the `transformers` logging utilities.
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
|
||||
# Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches
|
||||
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> list[np.array]:
    """
    Cut an image into a row-major list of tiles.

    Tiles are taken every `patch_size` pixels along the height and then the width. Tiles that touch the
    bottom or right border are smaller than `patch_size` when the image size is not a multiple of it.

    Args:
        image (`np.array`):
            The image to tile.
        patch_size (`int`):
            Side length, in pixels, of a full tile.
        input_data_format (`ChannelDimension` or `str`):
            Channel layout of `image`. Anything other than `ChannelDimension.LAST` is sliced as
            channels-first.

    Returns:
        `list[np.array]`: The tiles, ordered top-to-bottom and, within a row, left-to-right.
    """
    height, width = get_image_size(image, channel_dim=input_data_format)
    channels_last = input_data_format == ChannelDimension.LAST

    row_slices = [slice(top, top + patch_size) for top in range(0, height, patch_size)]
    col_slices = [slice(left, left + patch_size) for left in range(0, width, patch_size)]

    if channels_last:
        return [image[rows, cols] for rows in row_slices for cols in col_slices]
    # Channels-first: keep the leading channel axis intact and slice the two spatial axes.
    return [image[:, rows, cols] for rows in row_slices for cols in col_slices]
|
||||
|
||||
|
||||
# Adapted from transformers.models.llava_next.image_processing_llava_next.expand_to_square
def expand_to_square(image: np.array, background_color, input_data_format) -> np.array:
    """
    Expand an image to a square canvas filled with a background color, keeping the image centered.

    Args:
        image (`np.array`):
            The image to expand.
        background_color:
            Fill value for the added border. Either a scalar or one value per channel.
        input_data_format (`ChannelDimension` or `str`):
            Channel layout of `image`. `ChannelDimension.FIRST` is handled as `(channels, height, width)`;
            any other value is handled as `(height, width, channels)`.

    Returns:
        `np.array`: `image` itself when it is already square, otherwise a new square array whose side is
        `max(height, width)`. The result dtype follows NumPy promotion of `image.dtype` with
        `background_color`.
    """
    height, width = get_image_size(image, channel_dim=input_data_format)
    if width == height:
        return image

    side = max(height, width)
    # Offsets that center the image on the canvas; the offset along the longer axis is always 0.
    top = (side - height) // 2
    left = (side - width) // 2

    if input_data_format == ChannelDimension.FIRST:
        # Shape the color as (channels, 1, 1) so it broadcasts over the two spatial axes.
        color = np.asarray(background_color).reshape(-1, 1, 1)
        result = np.ones((image.shape[0], side, side), dtype=image.dtype) * color
        result[:, top : top + height, left : left + width] = image
    else:
        result = np.ones((side, side, image.shape[2]), dtype=image.dtype) * background_color
        result[top : top + height, left : left + width] = image
    return result
|
||||
|
||||
|
||||
class RImageProcessor(BaseImageProcessor):
    """
    Image processor that converts images into stacks of equally sized patches.

    For each input image, `preprocess` either tiles it against the best matching resolution from
    `image_grid_pinpoints` (samples made of a single image) or pads it to a square (samples made of several
    images), then optionally resizes, rescales and normalizes every resulting patch.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize every patch to `size`.
        size (`dict[str, int]`, *optional*, defaults to `{"height": 384, "width": 384}`):
            Target patch size.
        image_grid_pinpoints (`list`, *optional*, defaults to
            `[[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]`):
            Candidate resolutions among which the tiling resolution is selected.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
            Resampling filter used when resizing.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to multiply pixel values by `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1 / 255`):
            Factor used when rescaling.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize with `image_mean` and `image_std`.
        image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
            Mean used for normalization.
        image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
            Standard deviation used for normalization.
        do_pad (`bool`, *optional*, defaults to `True`):
            Whether to pad every patch stack in a batch to the same number of patches.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to pass every image through `convert_to_rgb` first.
    """

    # NOTE(review): `preprocess` emits `pixel_values`, `image_sizes` and `batch_num_images`, not
    # `pixel_values_videos` -- confirm this list is what the paired processor expects.
    model_input_names = ["pixel_values_videos"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[dict[str, int]] = None,
        image_grid_pinpoints: Optional[list] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        do_pad: Optional[bool] = True,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
        """Store the preprocessing defaults; see the class docstring for the meaning of each argument."""
        super().__init__(**kwargs)
        size = size if size is not None else {"height": 384, "width": 384}
        size = get_size_dict(size, default_to_square=False)
        image_grid_pinpoints = (
            image_grid_pinpoints
            if image_grid_pinpoints is not None
            else [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
        )
        self.do_resize = do_resize
        self.size = size
        self.image_grid_pinpoints = image_grid_pinpoints
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_pad = do_pad
        self.do_convert_rgb = do_convert_rgb

    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor.pad
    def pad(
        self,
        image: np.ndarray,
        padding: Union[int, tuple[int, int], Iterable[tuple[int, int]]],
        mode: PaddingMode = PaddingMode.CONSTANT,
        constant_values: Union[float, Iterable[float]] = 0.0,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Pad an image, or a stack of patches, with the given padding.

        Args:
            image (`np.ndarray`):
                The array to pad.
            padding (`int`, `tuple[int, int]` or `Iterable[tuple[int, int]]`):
                Padding specification. An `int`, or anything whose length is not 4, is forwarded to the
                generic `pad` transform. A specification of length 4 is passed to `np.pad` as is, one
                entry per axis of `image`.
            mode (`PaddingMode`, *optional*, defaults to `PaddingMode.CONSTANT`):
                Padding mode.
            constant_values (`float` or `Iterable[float]`, *optional*, defaults to 0.0):
                Fill value, only used with `PaddingMode.CONSTANT`.
            data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the output. When unset, the layout is left unchanged.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the input. Inferred when unset.

        Returns:
            `np.ndarray`: The padded array.

        Raises:
            ValueError: If `mode` is not a supported padding mode.
        """

        # call the general `pad` if padding on `height/width`, otherwise it's the `num_patched` dim
        if isinstance(padding, int) or len(padding) != 4:
            return pad(image, padding, mode, constant_values, data_format, input_data_format)

        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)
        if mode == PaddingMode.CONSTANT:
            image = np.pad(image, padding, mode="constant", constant_values=constant_values)
        elif mode == PaddingMode.REFLECT:
            image = np.pad(image, padding, mode="reflect")
        elif mode == PaddingMode.REPLICATE:
            image = np.pad(image, padding, mode="edge")
        elif mode == PaddingMode.SYMMETRIC:
            image = np.pad(image, padding, mode="symmetric")
        else:
            raise ValueError(f"Invalid padding mode: {mode}")
        image = (
            to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
        )
        return image

    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._resize_for_patching
    def _resize_for_patching(
        self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension
    ) -> np.array:
        """
        Resize an image to the size `get_patch_output_size` reports for `target_resolution`.

        Args:
            image (`np.array`):
                The image to resize.
            target_resolution (`tuple`):
                Resolution selected for tiling.
            resample:
                Resampling filter forwarded to `resize`.
            input_data_format (`ChannelDimension`):
                Channel layout of `image`.

        Returns:
            `np.array`: The resized image.
        """
        # presumably the aspect-ratio-preserving size that fits inside `target_resolution` -- defined in
        # `transformers.image_processing_utils`, TODO confirm
        new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

        # Resize the image
        resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)

        return resized_image

    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._get_padding_size
    def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple):
        """
        Compute the padding that centers `original_resolution` inside `target_resolution`.

        Args:
            original_resolution (`tuple`):
                Current `(height, width)`.
            target_resolution (`tuple`):
                Desired `(height, width)`.

        Returns:
            `tuple`: `((before, after), (before, after))` padding amounts for the height and then the
            width. When a difference is odd, the extra pixel goes to the `after` side.
        """
        original_height, original_width = original_resolution
        target_height, target_width = target_resolution
        paste_x, r_x = divmod(target_width - original_width, 2)
        paste_y, r_y = divmod(target_height - original_height, 2)
        return (paste_y, paste_y + r_y), (paste_x, paste_x + r_x)

    # Adapted from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._pad_for_patching
    def _pad_for_patching(
        self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension
    ) -> np.array:
        """
        Pad an image to a target resolution while maintaining aspect ratio.

        Args:
            image (`np.array`):
                The image to pad.
            target_resolution (`tuple`):
                Desired `(height, width)` after padding.
            input_data_format (`ChannelDimension`):
                Channel layout of `image`.

        Returns:
            `np.array`: The padded image, padded with the defaults of `self.pad`.
        """
        new_resolution = get_patch_output_size(image, target_resolution, input_data_format)
        padding = self._get_padding_size(new_resolution, target_resolution)

        # Forward the known layout so the generic `pad` does not have to re-infer it from the shape.
        padded_image = self.pad(image, padding=padding, input_data_format=input_data_format)

        return padded_image

    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor.get_image_patches
    def get_image_patches(
        self,
        image: np.array,
        grid_pinpoints,
        size: tuple,
        patch_size: int,
        resample: PILImageResampling,
        data_format: ChannelDimension,
        input_data_format: ChannelDimension,
    ) -> list[np.array]:
        """
        Turn one image into a resized overview followed by the tiles of its best-fitting resolution.

        Args:
            image (`np.array`):
                The image to process.
            grid_pinpoints (`list`):
                Candidate resolutions; the one used is chosen by `select_best_resolution`.
            size (`tuple`):
                Size the overview image is resized to.
            patch_size (`int`):
                Side length of each tile.
            resample (`PILImageResampling`):
                Resampling filter used when resizing.
            data_format (`ChannelDimension`):
                Channel layout of the returned arrays.
            input_data_format (`ChannelDimension`):
                Channel layout of `image`.

        Returns:
            `list[np.array]`: The resized original image followed by all tiles.

        Raises:
            TypeError: If `grid_pinpoints` is not a list.
        """
        if not isinstance(grid_pinpoints, list):
            raise TypeError("grid_pinpoints must be a list of possible resolutions.")

        possible_resolutions = grid_pinpoints

        image_size = get_image_size(image, channel_dim=input_data_format)
        best_resolution = select_best_resolution(image_size, possible_resolutions)
        resized_image = self._resize_for_patching(
            image, best_resolution, resample=resample, input_data_format=input_data_format
        )
        padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format)

        patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format)

        # convert every patch to the requested output `data_format`
        patches = [
            to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format)
            for patch in patches
        ]

        resized_original_image = resize(
            image,
            size=size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )

        image_patches = [resized_original_image] + patches

        return image_patches

    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._pad_for_batching
    def _pad_for_batching(
        self,
        pixel_values: list[np.ndarray],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Pad every patch stack along its first axis so that all stacks hold the same number of patches.

        Args:
            pixel_values (`list[np.ndarray]`):
                One 4-axis array per image; the first axis indexes patches.
            data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the output. When unset, the layout is left unchanged.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the input. Inferred when unset.

        Returns:
            `list[np.ndarray]`: The stacks, each extended with trailing zero patches up to the largest
            patch count found in `pixel_values`.
        """
        max_patch = max(len(x) for x in pixel_values)
        pixel_values = [
            self.pad(
                image,
                padding=((0, max_patch - image.shape[0]), (0, 0), (0, 0), (0, 0)),
                data_format=data_format,
                input_data_format=input_data_format,
            )
            for image in pixel_values
        ]

        return pixel_values

    # Adapted from transformers.models.llava.image_processing_llava.LlavaImageProcessor.pad_to_square
    def pad_to_square(
        self,
        image: np.ndarray,
        background_color: Union[int, tuple[int, int, int]] = 0,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.array:
        """
        Pad an image to a square of side `max(height, width)`, keeping the image centered.

        Args:
            image (`np.ndarray`):
                The image to pad.
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                Fill color of the added border. A tuple must hold one value per channel. An `int` is
                applied to the first channel only; the remaining channels are filled with 0.
            data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the output. When unset, the input layout is kept.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the input. Inferred from `image` when unset.

        Returns:
            `np.array`: The square image.

        Raises:
            ValueError: If `background_color` is a sequence whose length differs from the channel count.
        """
        # Resolve the layout up front: the channel count below depends on it, and reading it from the
        # last axis of a channels-first image would be wrong.
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)

        height, width = get_image_size(image, input_data_format)
        num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1]

        if height == width:
            image = (
                to_channel_dimension_format(image, data_format, input_data_format)
                if data_format is not None
                else image
            )
            return image

        max_dim = max(height, width)

        # Ensure background_color is the correct shape
        if isinstance(background_color, int):
            background_color = [background_color]
        elif len(background_color) != num_channels:
            raise ValueError(
                f"background_color must have exactly {num_channels} elements to match the number of channels"
            )

        if input_data_format == ChannelDimension.FIRST:
            result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype)
            for i, color in enumerate(background_color):
                result[i, :, :] = color
            if width > height:
                start = (max_dim - height) // 2
                result[:, start : start + height, :] = image
            else:
                start = (max_dim - width) // 2
                result[:, :, start : start + width] = image
        else:
            result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype)
            for i, color in enumerate(background_color):
                result[:, :, i] = color
            if width > height:
                start = (max_dim - height) // 2
                result[start : start + height, :, :] = image
            else:
                start = (max_dim - width) // 2
                result[:, start : start + width, :] = image

        image = (
            to_channel_dimension_format(result, data_format, input_data_format) if data_format is not None else result
        )
        return image

    def _preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[tuple[int, int]] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> list[np.ndarray]:
        """
        Apply resize, rescale and normalization to a list of images, then convert their channel layout.

        Args:
            images (`ImageInput`):
                Iterable of images (the patches of one sample) to process.
            do_resize (`bool`, *optional*):
                Whether to resize every image to `size`.
            size (`tuple[int, int]`, *optional*):
                Target size forwarded to `resize`; `preprocess` passes a `(height, width)` tuple.
            resample (`PILImageResampling`, *optional*):
                Resampling filter used when resizing.
            do_rescale (`bool`, *optional*):
                Whether to multiply pixel values by `rescale_factor`.
            rescale_factor (`float`, *optional*):
                Factor used when rescaling.
            do_normalize (`bool`, *optional*):
                Whether to normalize with `image_mean` and `image_std`.
            image_mean (`float` or `list[float]`, *optional*):
                Mean used for normalization.
            image_std (`float` or `list[float]`, *optional*):
                Standard deviation used for normalization.
            do_convert_rgb (`bool`, *optional*):
                Accepted for signature compatibility; not used by this method.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                Channel layout of the returned images.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the incoming images.

        Returns:
            `list[np.ndarray]`: The processed images in `data_format` layout.
        """
        if do_resize:
            images = [
                resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
                for image in images
            ]

        if do_rescale:
            images = [
                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
                for image in images
            ]

        if do_normalize:
            images = [
                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
                for image in images
            ]

        images = [
            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
        ]

        return images

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[dict[str, int]] = None,
        image_grid_pinpoints: Optional[list] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        do_pad: Optional[bool] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> BatchFeature:
        """
        Preprocess an image, a list of images, or a list of lists of images.

        Every argument left as `None` falls back to the value stored on the processor.

        Args:
            images (`ImageInput`):
                A single image, a flat list of images (each treated as its own single-image sample), or a
                list of lists where each inner list is one sample.
            do_resize (`bool`, *optional*):
                Whether to resize every patch to `size`.
            size (`dict[str, int]`, *optional*):
                Target patch size, given either as `height`/`width` or as `shortest_edge`.
            image_grid_pinpoints (`list`, *optional*):
                Candidate resolutions used to tile single-image samples.
            resample (`PILImageResampling`, *optional*):
                Resampling filter used when resizing.
            do_rescale (`bool`, *optional*):
                Whether to multiply pixel values by `rescale_factor`.
            rescale_factor (`float`, *optional*):
                Factor used when rescaling.
            do_normalize (`bool`, *optional*):
                Whether to normalize with `image_mean` and `image_std`.
            image_mean (`float` or `list[float]`, *optional*):
                Mean used for normalization and, scaled by 255, as the padding color of multi-image
                samples.
            image_std (`float` or `list[float]`, *optional*):
                Standard deviation used for normalization.
            do_pad (`bool`, *optional*):
                Whether to pad all patch stacks to the same number of patches. When disabled, the stacks
                are returned with their own patch counts.
            do_convert_rgb (`bool`, *optional*):
                Whether to pass every image through `convert_to_rgb` first.
            return_tensors (`str` or `TensorType`, *optional*):
                Tensor type forwarded to `BatchFeature`.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                Channel layout of the returned patches.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the inputs. Inferred from the first image when unset.

        Returns:
            `BatchFeature`: Holds `pixel_values` (one patch stack per image), `image_sizes` (the
            `(height, width)` of each image before patching) and `batch_num_images` (images per sample).

        Raises:
            ValueError: If `images` contains an unsupported image type.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        size = get_size_dict(size, default_to_square=False)
        image_grid_pinpoints = image_grid_pinpoints if image_grid_pinpoints is not None else self.image_grid_pinpoints
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_pad = do_pad if do_pad is not None else self.do_pad
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)):
            # if the first element is a list, we assume that all elements are lists
            batch_num_images = [len(x) for x in images]
        elif isinstance(images, (tuple, list)):
            # treat this as a single-image case for backward compatibility
            batch_num_images = [1] * len(images)
        else:
            batch_num_images = [1]
        # only single image patching is supported
        need_patching = [n == 1 for n in batch_num_images for _ in range(n)]

        images = make_flat_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if do_rescale and is_scaled_image(images[0]):
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        size_tuple = (
            (size["height"], size["width"])
            if "height" in size and "width" in size
            else (size["shortest_edge"], size["shortest_edge"])
        )

        new_images = []
        image_sizes = [get_image_size(image, channel_dim=input_data_format) for image in images]
        for i, image in enumerate(images):
            if need_patching[i]:
                # convert image into a list of patches
                # we intentionally use the same data format as the input data format
                image_patches = self.get_image_patches(
                    image,
                    image_grid_pinpoints,
                    size=size_tuple,
                    patch_size=size_tuple[0],
                    resample=resample,
                    data_format=input_data_format,
                    input_data_format=input_data_format,
                )
            else:
                # Pad with the mean color that normalization will use, so a per-call `image_mean`
                # override is honored here as well.
                padded_image = self.pad_to_square(
                    image=image,
                    background_color=tuple(int(x * 255) for x in image_mean),
                    input_data_format=input_data_format,
                )
                image_patches = [padded_image]

            # preprocess patches
            pixel_values = self._preprocess(
                image_patches,
                do_resize=do_resize,
                size=size_tuple,
                resample=resample,
                do_rescale=do_rescale,
                rescale_factor=rescale_factor,
                do_normalize=do_normalize,
                image_mean=image_mean,
                image_std=image_std,
                data_format=data_format,
                input_data_format=input_data_format,
            )
            pixel_values = np.array(pixel_values)
            new_images.append(pixel_values)

        # Without padding, hand back the stacks as they are instead of leaving the name unbound.
        processed_images = self._pad_for_batching(new_images) if do_pad else new_images

        return BatchFeature(
            data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
            tensor_type=return_tensors,
        )
|
||||
|
||||
|
||||
# Names exported by a star-import of this module.
__all__ = ["RImageProcessor"]
|
||||
Reference in New Issue
Block a user