Sync from v0.13
This commit is contained in:
25
vllm/transformers_utils/processors/__init__.py
Normal file
25
vllm/transformers_utils/processors/__init__.py
Normal file
@@ -0,0 +1,25 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Multi-modal processors may be defined in this directory for the following
|
||||
reasons:
|
||||
|
||||
- There is no processing file defined by HF Hub or Transformers library.
|
||||
- There is a need to override the existing processor to support vLLM.
|
||||
"""
|
||||
|
||||
from vllm.transformers_utils.processors.bagel import BagelProcessor
|
||||
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
|
||||
from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
|
||||
from vllm.transformers_utils.processors.hunyuan_vl_image import HunYuanVLImageProcessor
|
||||
from vllm.transformers_utils.processors.ovis import OvisProcessor
|
||||
from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
|
||||
|
||||
__all__ = [
|
||||
"BagelProcessor",
|
||||
"DeepseekVLV2Processor",
|
||||
"HunYuanVLProcessor",
|
||||
"HunYuanVLImageProcessor",
|
||||
"OvisProcessor",
|
||||
"Ovis2_5Processor",
|
||||
]
|
||||
73
vllm/transformers_utils/processors/bagel.py
Normal file
73
vllm/transformers_utils/processors/bagel.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
|
||||
"""BAGEL processor for image and text inputs."""
|
||||
|
||||
from transformers import AutoProcessor
|
||||
from transformers.image_utils import ImageInput
|
||||
from transformers.processing_utils import ProcessorMixin
|
||||
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
|
||||
class BagelProcessor(ProcessorMixin):
|
||||
"""
|
||||
Constructs a BAGEL processor which wraps a
|
||||
SigLIP image processor and a Qwen2 tokenizer.
|
||||
"""
|
||||
|
||||
attributes = ["image_processor", "tokenizer"]
|
||||
image_processor_class = "SiglipImageProcessor"
|
||||
tokenizer_class = "AutoTokenizer"
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: TextInput
|
||||
| PreTokenizedInput
|
||||
| list[TextInput]
|
||||
| list[PreTokenizedInput] = None,
|
||||
images: ImageInput = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Main method to prepare for the model one or several sequences(s) and image(s).
|
||||
"""
|
||||
if images is not None:
|
||||
# Process images with the image processor
|
||||
# Ensure return_tensors is set to "pt" for PyTorch tensors
|
||||
image_kwargs = {**kwargs}
|
||||
if "return_tensors" not in image_kwargs:
|
||||
image_kwargs["return_tensors"] = "pt"
|
||||
pixel_values = self.image_processor(images, **image_kwargs)
|
||||
else:
|
||||
pixel_values = None
|
||||
|
||||
text_inputs = self.tokenizer(text, **kwargs) if text is not None else None
|
||||
|
||||
if pixel_values is not None and text_inputs is not None:
|
||||
text_inputs["pixel_values"] = pixel_values["pixel_values"]
|
||||
return text_inputs
|
||||
elif pixel_values is not None:
|
||||
return pixel_values
|
||||
else:
|
||||
return text_inputs
|
||||
|
||||
def batch_decode(self, *args, **kwargs):
|
||||
"""
|
||||
This method forwards all its arguments to Qwen2TokenizerFast's batch_decode.
|
||||
"""
|
||||
return self.tokenizer.batch_decode(*args, **kwargs)
|
||||
|
||||
def decode(self, *args, **kwargs):
|
||||
"""
|
||||
This method forwards all its arguments to Qwen2TokenizerFast's decode.
|
||||
"""
|
||||
return self.tokenizer.decode(*args, **kwargs)
|
||||
|
||||
@property
|
||||
def model_input_names(self):
|
||||
tokenizer_input_names = self.tokenizer.model_input_names
|
||||
image_processor_input_names = self.image_processor.model_input_names
|
||||
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||
|
||||
|
||||
AutoProcessor.register("BagelProcessor", BagelProcessor)
|
||||
438
vllm/transformers_utils/processors/deepseek_ocr.py
Normal file
438
vllm/transformers_utils/processors/deepseek_ocr.py
Normal file
@@ -0,0 +1,438 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# adapted from https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek-OCR-master/DeepSeek-OCR-vllm/process/image_process.py
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image, ImageOps
|
||||
from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
|
||||
from transformers.processing_utils import ProcessorMixin
|
||||
|
||||
# TODO(Isotr0py): change modes for variants
|
||||
# see: https://github.com/deepseek-ai/DeepSeek-OCR/blob/8cf003d38821fa1b19c73da3bd1b0dc262ea8136/DeepSeek-OCR-master/DeepSeek-OCR-vllm/config.py#L1-L6
|
||||
# Tiny: base_size = 512, image_size = 512, crop_mode = False
|
||||
# Small: base_size = 640, image_size = 640, crop_mode = False
|
||||
# Base: base_size = 1024, image_size = 1024, crop_mode = False
|
||||
# Large: base_size = 1280, image_size = 1280, crop_mode = False
|
||||
# Gundam: base_size = 1024, image_size = 640, crop_mode = True
|
||||
BASE_SIZE = 1024
|
||||
IMAGE_SIZE = 640
|
||||
CROP_MODE = True
|
||||
|
||||
# TODO(Isotr0py): Expose as mm_kwargs
|
||||
MIN_CROPS = 2
|
||||
MAX_CROPS = 6 # max:9; If your GPU memory is small, it is recommended to set it to 6.
|
||||
|
||||
|
||||
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
|
||||
best_ratio_diff = float("inf")
|
||||
best_ratio = (1, 1)
|
||||
area = width * height
|
||||
for ratio in target_ratios:
|
||||
target_aspect_ratio = ratio[0] / ratio[1]
|
||||
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
|
||||
if ratio_diff < best_ratio_diff:
|
||||
best_ratio_diff = ratio_diff
|
||||
best_ratio = ratio
|
||||
elif ratio_diff == best_ratio_diff:
|
||||
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
|
||||
best_ratio = ratio
|
||||
return best_ratio
|
||||
|
||||
|
||||
def calculate_aspect_ratios(
|
||||
min_num: int = MIN_CROPS, max_num: int = MAX_CROPS
|
||||
) -> list[tuple[int, int]]:
|
||||
target_ratios: set[tuple[int, int]] = set(
|
||||
(i, j)
|
||||
for n in range(min_num, max_num + 1)
|
||||
for i in range(1, n + 1)
|
||||
for j in range(1, n + 1)
|
||||
if i * j <= max_num and i * j >= min_num
|
||||
)
|
||||
sorted_target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
|
||||
return sorted_target_ratios
|
||||
|
||||
|
||||
def count_tiles(
|
||||
orig_width,
|
||||
orig_height,
|
||||
min_num=MIN_CROPS,
|
||||
max_num=MAX_CROPS,
|
||||
image_size=640,
|
||||
use_thumbnail=False,
|
||||
):
|
||||
aspect_ratio = orig_width / orig_height
|
||||
|
||||
# calculate the existing image aspect ratio
|
||||
target_ratios = calculate_aspect_ratios(min_num, max_num)
|
||||
|
||||
# find the closest aspect ratio to the target
|
||||
target_aspect_ratio = find_closest_aspect_ratio(
|
||||
aspect_ratio, target_ratios, orig_width, orig_height, image_size
|
||||
)
|
||||
|
||||
return target_aspect_ratio
|
||||
|
||||
|
||||
def dynamic_preprocess(
|
||||
image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False
|
||||
):
|
||||
orig_width, orig_height = image.size
|
||||
aspect_ratio = orig_width / orig_height
|
||||
|
||||
# calculate the existing image aspect ratio
|
||||
target_ratios = calculate_aspect_ratios(min_num, max_num)
|
||||
|
||||
# find the closest aspect ratio to the target
|
||||
target_aspect_ratio = find_closest_aspect_ratio(
|
||||
aspect_ratio, target_ratios, orig_width, orig_height, image_size
|
||||
)
|
||||
|
||||
# calculate the target width and height
|
||||
target_width = image_size * target_aspect_ratio[0]
|
||||
target_height = image_size * target_aspect_ratio[1]
|
||||
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
|
||||
|
||||
# resize the image
|
||||
resized_img = image.resize((target_width, target_height))
|
||||
processed_images = []
|
||||
for i in range(blocks):
|
||||
box = (
|
||||
(i % (target_width // image_size)) * image_size,
|
||||
(i // (target_width // image_size)) * image_size,
|
||||
((i % (target_width // image_size)) + 1) * image_size,
|
||||
((i // (target_width // image_size)) + 1) * image_size,
|
||||
)
|
||||
# split the image
|
||||
split_img = resized_img.crop(box)
|
||||
processed_images.append(split_img)
|
||||
assert len(processed_images) == blocks
|
||||
if use_thumbnail and len(processed_images) != 1:
|
||||
thumbnail_img = image.resize((image_size, image_size))
|
||||
processed_images.append(thumbnail_img)
|
||||
return processed_images, target_aspect_ratio
|
||||
|
||||
|
||||
class ImageTransform:
|
||||
def __init__(
|
||||
self,
|
||||
mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
std: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
normalize: bool = True,
|
||||
):
|
||||
self.mean = mean
|
||||
self.std = std
|
||||
self.normalize = normalize
|
||||
|
||||
transform_pipelines = [T.ToTensor()]
|
||||
|
||||
if normalize:
|
||||
transform_pipelines.append(T.Normalize(mean, std))
|
||||
|
||||
self.transform = T.Compose(transform_pipelines)
|
||||
|
||||
def __call__(self, pil_img: Image.Image):
|
||||
x = self.transform(pil_img)
|
||||
return x
|
||||
|
||||
|
||||
class DeepseekOCRProcessor(ProcessorMixin):
|
||||
tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
|
||||
attributes = ["tokenizer"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: LlamaTokenizerFast,
|
||||
patch_size: int = 16,
|
||||
downsample_ratio: int = 4,
|
||||
image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
image_std: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
normalize: bool = True,
|
||||
image_token: str = "<image>",
|
||||
pad_token: str = "<|▁pad▁|>",
|
||||
add_special_token: bool = False,
|
||||
sft_format: str = "deepseek",
|
||||
mask_prompt: bool = True,
|
||||
ignore_id: int = -100,
|
||||
**kwargs,
|
||||
):
|
||||
self.image_size = IMAGE_SIZE
|
||||
self.base_size = BASE_SIZE
|
||||
self.patch_size = 16
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.normalize = normalize
|
||||
self.downsample_ratio = 4
|
||||
|
||||
self.image_transform = ImageTransform(
|
||||
mean=image_mean, std=image_std, normalize=normalize
|
||||
)
|
||||
|
||||
self.tokenizer = tokenizer
|
||||
self.tokenizer.padding_side = "left" # must set this,padding side with make a difference in batch inference # noqa: E501
|
||||
|
||||
# add the pad_token as special token to use 'tokenizer.pad_token'
|
||||
# and 'tokenizer.pad_token_id'
|
||||
if self.tokenizer.pad_token is None:
|
||||
self.tokenizer.add_special_tokens({"pad_token": pad_token})
|
||||
|
||||
# add image token
|
||||
self.image_token_id = self.tokenizer.vocab.get(image_token)
|
||||
self.image_token = image_token
|
||||
self.pad_token = pad_token
|
||||
self.add_special_token = add_special_token
|
||||
self.sft_format = sft_format
|
||||
self.mask_prompt = mask_prompt
|
||||
self.ignore_id = ignore_id
|
||||
|
||||
super().__init__(
|
||||
tokenizer,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def bos_id(self):
|
||||
return self.tokenizer.bos_token_id
|
||||
|
||||
@property
|
||||
def eos_id(self):
|
||||
return self.tokenizer.eos_token_id
|
||||
|
||||
@property
|
||||
def pad_id(self):
|
||||
return self.tokenizer.pad_token_id
|
||||
|
||||
def encode(self, text: str, bos: bool = True, eos: bool = False):
|
||||
t = self.tokenizer.encode(text, add_special_tokens=False)
|
||||
if bos:
|
||||
t = [self.bos_id] + t
|
||||
if eos:
|
||||
t = t + [self.eos_id]
|
||||
return t
|
||||
|
||||
def decode(self, t: list[int], **kwargs) -> str:
|
||||
return self.tokenizer.decode(t, **kwargs)
|
||||
|
||||
def process_one(
|
||||
self,
|
||||
prompt: str,
|
||||
images: list[Image.Image],
|
||||
crop_mode: bool = CROP_MODE,
|
||||
):
|
||||
"""
|
||||
|
||||
Args:
|
||||
prompt (str): the formatted prompt;
|
||||
images (List[ImageType]): the list of images;
|
||||
crop_mode (bool): if True, then crop the image;
|
||||
|
||||
Returns:
|
||||
outputs (BaseProcessorOutput): the output of the processor,
|
||||
- input_ids (torch.LongTensor): [N + image tokens]
|
||||
- target_ids (torch.LongTensor): [N + image tokens]
|
||||
- pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
|
||||
- image_id (int): the id of the image token
|
||||
- num_image_tokens (List[int]): the number of image tokens
|
||||
"""
|
||||
|
||||
assert prompt is not None and images is not None, (
|
||||
"prompt and images must be used at the same time."
|
||||
)
|
||||
|
||||
sft_format = prompt
|
||||
|
||||
(
|
||||
input_ids,
|
||||
pixel_values,
|
||||
images_crop,
|
||||
images_seq_mask,
|
||||
images_spatial_crop,
|
||||
num_image_tokens,
|
||||
_,
|
||||
) = self.tokenize_with_images(
|
||||
conversation=sft_format,
|
||||
images=images,
|
||||
bos=True,
|
||||
eos=True,
|
||||
cropping=crop_mode,
|
||||
)
|
||||
|
||||
prepare = BatchFeature(
|
||||
data=dict(
|
||||
input_ids=input_ids,
|
||||
pixel_values=pixel_values,
|
||||
images_crop=images_crop,
|
||||
images_seq_mask=images_seq_mask,
|
||||
images_spatial_crop=images_spatial_crop,
|
||||
num_image_tokens=num_image_tokens,
|
||||
),
|
||||
tensor_type="pt",
|
||||
)
|
||||
return prepare
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
*,
|
||||
prompt: str,
|
||||
images: list[Image.Image],
|
||||
crop_mode: bool = CROP_MODE,
|
||||
**kwargs,
|
||||
):
|
||||
prepare = self.process_one(
|
||||
prompt=prompt,
|
||||
images=images,
|
||||
crop_mode=crop_mode,
|
||||
)
|
||||
|
||||
return prepare
|
||||
|
||||
def tokenize_with_images(
|
||||
self,
|
||||
conversation: str,
|
||||
images: list[Image.Image],
|
||||
bos: bool = True,
|
||||
eos: bool = True,
|
||||
cropping: bool = True,
|
||||
):
|
||||
"""Tokenize text with <image> tags."""
|
||||
|
||||
assert conversation.count(self.image_token) == len(images)
|
||||
text_splits = conversation.split(self.image_token)
|
||||
images_list, images_crop_list, images_seq_mask, images_spatial_crop = (
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
)
|
||||
image_shapes = []
|
||||
num_image_tokens = []
|
||||
tokenized_str = []
|
||||
for text_sep, image in zip(text_splits, images):
|
||||
tokenized_sep = self.encode(text_sep, bos=False, eos=False)
|
||||
tokenized_str += tokenized_sep
|
||||
images_seq_mask += [False] * len(tokenized_sep)
|
||||
|
||||
image_shapes.append(image.size)
|
||||
|
||||
images_crop_raw = []
|
||||
if image.size[0] <= 640 and image.size[1] <= 640:
|
||||
crop_ratio = [1, 1]
|
||||
elif cropping:
|
||||
images_crop_raw, crop_ratio = dynamic_preprocess(
|
||||
image, image_size=IMAGE_SIZE
|
||||
)
|
||||
else:
|
||||
crop_ratio = [1, 1]
|
||||
|
||||
if self.image_size <= 640 and not cropping:
|
||||
image = image.resize((self.image_size, self.image_size))
|
||||
|
||||
global_view = ImageOps.pad(
|
||||
image,
|
||||
(self.base_size, self.base_size),
|
||||
color=tuple(int(x * 255) for x in self.image_transform.mean),
|
||||
)
|
||||
images_list.append(self.image_transform(global_view))
|
||||
|
||||
num_width_tiles, num_height_tiles = crop_ratio
|
||||
images_spatial_crop.append([num_width_tiles, num_height_tiles])
|
||||
|
||||
if num_width_tiles > 1 or num_height_tiles > 1:
|
||||
for cropped_image in images_crop_raw:
|
||||
images_crop_list.append(self.image_transform(cropped_image))
|
||||
|
||||
num_queries = math.ceil(
|
||||
(self.image_size // self.patch_size) / self.downsample_ratio
|
||||
)
|
||||
num_queries_base = math.ceil(
|
||||
(self.base_size // self.patch_size) / self.downsample_ratio
|
||||
)
|
||||
|
||||
tokenized_image = (
|
||||
[self.image_token_id] * num_queries_base + [self.image_token_id]
|
||||
) * num_queries_base
|
||||
tokenized_image += [self.image_token_id]
|
||||
if num_width_tiles > 1 or num_height_tiles > 1:
|
||||
local_row = [self.image_token_id] * (num_queries * num_width_tiles + 1)
|
||||
tokenized_image += local_row * (num_queries * num_height_tiles)
|
||||
tokenized_str += tokenized_image
|
||||
images_seq_mask += [True] * len(tokenized_image)
|
||||
num_image_tokens.append(len(tokenized_image))
|
||||
|
||||
"""process the last text split"""
|
||||
tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
|
||||
tokenized_str += tokenized_sep
|
||||
images_seq_mask += [False] * len(tokenized_sep)
|
||||
|
||||
"""add the bos and eos tokens"""
|
||||
if bos:
|
||||
tokenized_str = [self.bos_id] + tokenized_str
|
||||
images_seq_mask = [False] + images_seq_mask
|
||||
if eos:
|
||||
tokenized_str = tokenized_str + [self.eos_id]
|
||||
images_seq_mask = images_seq_mask + [False]
|
||||
|
||||
assert len(tokenized_str) == len(images_seq_mask), (
|
||||
f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} "
|
||||
f"is not equal to images_seq_mask's length {len(images_seq_mask)}."
|
||||
)
|
||||
|
||||
masked_tokenized_str = []
|
||||
for token_index in tokenized_str:
|
||||
if token_index != self.image_token_id:
|
||||
masked_tokenized_str.append(token_index)
|
||||
else:
|
||||
masked_tokenized_str.append(self.ignore_id)
|
||||
|
||||
assert (
|
||||
len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
|
||||
), (
|
||||
f"tokenized_str's length {len(tokenized_str)}, "
|
||||
f"input_ids' length {len(masked_tokenized_str)}, "
|
||||
f"images_seq_mask's length {len(images_seq_mask)}, are not equal."
|
||||
)
|
||||
|
||||
input_ids = torch.LongTensor(tokenized_str)
|
||||
target_ids = torch.LongTensor(masked_tokenized_str)
|
||||
images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
|
||||
|
||||
# set input_ids < 0 | input_ids == self.image_token_id as ignore_id
|
||||
target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
|
||||
self.ignore_id
|
||||
)
|
||||
input_ids[input_ids < 0] = self.pad_id
|
||||
|
||||
# Remove the ending eos token
|
||||
assert input_ids[-1] == self.eos_id
|
||||
input_ids = input_ids[:-1]
|
||||
target_ids = target_ids[:-1]
|
||||
images_seq_mask = images_seq_mask[:-1]
|
||||
|
||||
if len(images_list) == 0:
|
||||
pixel_values = torch.zeros((0, 3, self.base_size, self.base_size))
|
||||
images_spatial_crop = torch.zeros((0, 2), dtype=torch.long)
|
||||
images_crop = torch.zeros((0, 3, self.image_size, self.image_size))
|
||||
else:
|
||||
pixel_values = torch.stack(images_list, dim=0)
|
||||
images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
|
||||
if images_crop_list:
|
||||
images_crop = torch.stack(images_crop_list, dim=0)
|
||||
else:
|
||||
images_crop = torch.zeros((0, 3, self.image_size, self.image_size))
|
||||
|
||||
input_ids = input_ids.unsqueeze(0)
|
||||
|
||||
return (
|
||||
input_ids,
|
||||
pixel_values,
|
||||
images_crop,
|
||||
images_seq_mask,
|
||||
images_spatial_crop,
|
||||
num_image_tokens,
|
||||
image_shapes,
|
||||
)
|
||||
|
||||
|
||||
AutoProcessor.register("DeepseekOCRProcessor", DeepseekOCRProcessor)
|
||||
406
vllm/transformers_utils/processors/deepseek_vl2.py
Normal file
406
vllm/transformers_utils/processors/deepseek_vl2.py
Normal file
@@ -0,0 +1,406 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
|
||||
# Copyright (c) 2023-2024 DeepSeek.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
import math
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image, ImageOps
|
||||
from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
|
||||
from transformers.processing_utils import ProcessorMixin
|
||||
|
||||
|
||||
class ImageTransform:
|
||||
def __init__(
|
||||
self,
|
||||
mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
std: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
normalize: bool = True,
|
||||
):
|
||||
self.mean = mean
|
||||
self.std = std
|
||||
self.normalize = normalize
|
||||
|
||||
transform_pipelines = [T.ToTensor()]
|
||||
|
||||
if normalize:
|
||||
transform_pipelines.append(T.Normalize(mean, std))
|
||||
|
||||
self.transform = T.Compose(transform_pipelines)
|
||||
|
||||
def __call__(self, pil_img: Image.Image):
|
||||
x = self.transform(pil_img)
|
||||
return x
|
||||
|
||||
|
||||
class DeepseekVLV2Processor(ProcessorMixin):
|
||||
tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
|
||||
attributes = ["tokenizer"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: LlamaTokenizerFast,
|
||||
candidate_resolutions: tuple[tuple[int, int]],
|
||||
patch_size: int,
|
||||
downsample_ratio: int,
|
||||
image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
image_std: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
normalize: bool = True,
|
||||
image_token: str = "<image>",
|
||||
pad_token: str = "<|▁pad▁|>",
|
||||
add_special_token: bool = False,
|
||||
sft_format: str = "deepseek",
|
||||
mask_prompt: bool = True,
|
||||
ignore_id: int = -100,
|
||||
**kwargs,
|
||||
):
|
||||
self.candidate_resolutions = candidate_resolutions
|
||||
self.image_size = candidate_resolutions[0][0]
|
||||
self.patch_size = patch_size
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.normalize = normalize
|
||||
self.downsample_ratio = downsample_ratio
|
||||
|
||||
self.image_transform = ImageTransform(
|
||||
mean=image_mean, std=image_std, normalize=normalize
|
||||
)
|
||||
self.tokenizer = tokenizer
|
||||
self.tokenizer.padding_side = "left" # must set this,padding side with make a difference in batch inference
|
||||
|
||||
# add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
|
||||
if tokenizer.pad_token is None:
|
||||
self.tokenizer.add_special_tokens({"pad_token": pad_token})
|
||||
|
||||
# add image token
|
||||
image_token_id = self.tokenizer.vocab.get(image_token)
|
||||
if image_token_id is None:
|
||||
special_tokens = [image_token]
|
||||
special_tokens_dict = {"additional_special_tokens": special_tokens}
|
||||
self.tokenizer.add_special_tokens(special_tokens_dict)
|
||||
self.image_token_id = self.tokenizer.vocab.get(image_token)
|
||||
|
||||
# add five special tokens for grounding-related tasks
|
||||
# <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
|
||||
special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
|
||||
special_tokens_dict = {"additional_special_tokens": special_tokens}
|
||||
self.tokenizer.add_special_tokens(special_tokens_dict)
|
||||
|
||||
# add special tokens for SFT data
|
||||
special_tokens = ["<|User|>", "<|Assistant|>"]
|
||||
special_tokens_dict = {"additional_special_tokens": special_tokens}
|
||||
self.tokenizer.add_special_tokens(special_tokens_dict)
|
||||
|
||||
self.image_token = image_token
|
||||
self.pad_token = pad_token
|
||||
self.add_special_token = add_special_token
|
||||
self.sft_format = sft_format
|
||||
self.mask_prompt = mask_prompt
|
||||
self.ignore_id = ignore_id
|
||||
|
||||
super().__init__(
|
||||
tokenizer,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def select_best_resolution(self, image_size):
|
||||
# used for cropping
|
||||
original_width, original_height = image_size
|
||||
best_fit = None
|
||||
max_effective_resolution = 0
|
||||
min_wasted_resolution = float("inf")
|
||||
|
||||
for width, height in self.candidate_resolutions:
|
||||
scale = min(width / original_width, height / original_height)
|
||||
downscaled_width, downscaled_height = (
|
||||
int(original_width * scale),
|
||||
int(original_height * scale),
|
||||
)
|
||||
effective_resolution = min(
|
||||
downscaled_width * downscaled_height, original_width * original_height
|
||||
)
|
||||
wasted_resolution = (width * height) - effective_resolution
|
||||
|
||||
if effective_resolution > max_effective_resolution or (
|
||||
effective_resolution == max_effective_resolution
|
||||
and wasted_resolution < min_wasted_resolution
|
||||
):
|
||||
max_effective_resolution = effective_resolution
|
||||
min_wasted_resolution = wasted_resolution
|
||||
best_fit = (width, height)
|
||||
|
||||
return best_fit
|
||||
|
||||
@property
|
||||
def bos_id(self):
|
||||
return self.tokenizer.bos_token_id
|
||||
|
||||
@property
|
||||
def eos_id(self):
|
||||
return self.tokenizer.eos_token_id
|
||||
|
||||
@property
|
||||
def pad_id(self):
|
||||
return self.tokenizer.pad_token_id
|
||||
|
||||
def encode(self, text: str, bos: bool = True, eos: bool = False):
|
||||
t = self.tokenizer.encode(text, add_special_tokens=False)
|
||||
|
||||
if bos:
|
||||
t = [self.bos_id] + t
|
||||
if eos:
|
||||
t = t + [self.eos_id]
|
||||
|
||||
return t
|
||||
|
||||
def decode(self, t: list[int], **kwargs) -> str:
|
||||
return self.tokenizer.decode(t, **kwargs)
|
||||
|
||||
def process_one(
|
||||
self,
|
||||
prompt: str,
|
||||
images: list[Image.Image],
|
||||
inference_mode: bool = True,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
Args:
|
||||
prompt (str): the formatted prompt;
|
||||
images (list[ImageType]): the list of images;
|
||||
inference_mode (bool): if True, then remove the last eos token;
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
outputs (BaseProcessorOutput): the output of the processor,
|
||||
- input_ids (torch.LongTensor): [N + image tokens]
|
||||
- target_ids (torch.LongTensor): [N + image tokens]
|
||||
- pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
|
||||
- image_id (int): the id of the image token
|
||||
- num_image_tokens (list[int]): the number of image tokens
|
||||
"""
|
||||
|
||||
assert prompt is not None and images is not None, (
|
||||
"prompt and images must be used at the same time."
|
||||
)
|
||||
|
||||
sft_format = prompt
|
||||
(
|
||||
tokenized_str,
|
||||
images_list,
|
||||
images_seq_mask,
|
||||
images_spatial_crop,
|
||||
num_image_tokens,
|
||||
) = self.tokenize_with_images(
|
||||
sft_format, images, bos=True, eos=True, cropping=len(images) <= 2
|
||||
)
|
||||
masked_tokenized_str = []
|
||||
for token_index in tokenized_str:
|
||||
if token_index != self.image_token_id:
|
||||
masked_tokenized_str.append(token_index)
|
||||
else:
|
||||
masked_tokenized_str.append(self.ignore_id)
|
||||
|
||||
assert (
|
||||
len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
|
||||
), (
|
||||
f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
|
||||
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
|
||||
)
|
||||
|
||||
input_ids = torch.LongTensor(tokenized_str)
|
||||
target_ids = torch.LongTensor(masked_tokenized_str)
|
||||
images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
|
||||
|
||||
# set input_ids < 0 | input_ids == self.image_token_id as ignore_id
|
||||
target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
|
||||
self.ignore_id
|
||||
)
|
||||
input_ids[input_ids < 0] = self.pad_id
|
||||
|
||||
if inference_mode:
|
||||
# Remove the ending eos token
|
||||
assert input_ids[-1] == self.eos_id
|
||||
input_ids = input_ids[:-1]
|
||||
target_ids = target_ids[:-1]
|
||||
images_seq_mask = images_seq_mask[:-1]
|
||||
|
||||
if len(images_list) == 0:
|
||||
pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
|
||||
images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
|
||||
else:
|
||||
pixel_values = torch.stack(images_list, dim=0)
|
||||
images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
|
||||
|
||||
input_ids = input_ids.unsqueeze(0)
|
||||
|
||||
prepare = BatchFeature(
|
||||
data=dict(
|
||||
input_ids=input_ids,
|
||||
pixel_values=pixel_values,
|
||||
images_seq_mask=images_seq_mask,
|
||||
images_spatial_crop=images_spatial_crop,
|
||||
num_image_tokens=num_image_tokens,
|
||||
),
|
||||
tensor_type="pt",
|
||||
)
|
||||
return prepare
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
*,
|
||||
text: str,
|
||||
images: list[Image.Image],
|
||||
inference_mode: bool = True,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
Args:
|
||||
text (str): the formatted prompt;
|
||||
images (list[ImageType]): the list of images;
|
||||
inference_mode (bool): if True, then remove the last eos token;
|
||||
**kwargs:
|
||||
|
||||
Returns:
|
||||
outputs (BaseProcessorOutput): the output of the processor,
|
||||
- input_ids (torch.LongTensor): [N + image tokens]
|
||||
- images (torch.FloatTensor): [n_images, 3, H, W]
|
||||
- image_id (int): the id of the image token
|
||||
- num_image_tokens (list[int]): the number of image tokens
|
||||
"""
|
||||
|
||||
prepare = self.process_one(
|
||||
prompt=text,
|
||||
images=images,
|
||||
inference_mode=inference_mode,
|
||||
)
|
||||
|
||||
return prepare
|
||||
|
||||
def tokenize_with_images(
|
||||
self,
|
||||
conversation: str,
|
||||
images: list[Image.Image],
|
||||
bos: bool = True,
|
||||
eos: bool = True,
|
||||
cropping: bool = True,
|
||||
):
|
||||
"""Tokenize text with <image> tags."""
|
||||
assert conversation.count(self.image_token) == len(images)
|
||||
text_splits = conversation.split(self.image_token)
|
||||
images_list, images_seq_mask, images_spatial_crop = [], [], []
|
||||
num_image_tokens = []
|
||||
tokenized_str = []
|
||||
for text_sep, image in zip(text_splits, images):
|
||||
"""encode text_sep"""
|
||||
tokenized_sep = self.encode(text_sep, bos=False, eos=False)
|
||||
tokenized_str += tokenized_sep
|
||||
images_seq_mask += [False] * len(tokenized_sep)
|
||||
|
||||
"""select best resolution for anyres"""
|
||||
if cropping:
|
||||
best_width, best_height = self.select_best_resolution(image.size)
|
||||
else:
|
||||
best_width, best_height = self.image_size, self.image_size
|
||||
|
||||
"""process the global view"""
|
||||
global_view = ImageOps.pad(
|
||||
image,
|
||||
(self.image_size, self.image_size),
|
||||
color=tuple(int(x * 255) for x in self.image_transform.mean),
|
||||
)
|
||||
images_list.append(self.image_transform(global_view))
|
||||
|
||||
"""process the local views"""
|
||||
local_view = ImageOps.pad(
|
||||
image,
|
||||
(best_width, best_height),
|
||||
color=tuple(int(x * 255) for x in self.image_transform.mean),
|
||||
)
|
||||
for i in range(0, best_height, self.image_size):
|
||||
for j in range(0, best_width, self.image_size):
|
||||
images_list.append(
|
||||
self.image_transform(
|
||||
local_view.crop(
|
||||
(j, i, j + self.image_size, i + self.image_size)
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
"""record height / width crop num"""
|
||||
num_width_tiles, num_height_tiles = (
|
||||
best_width // self.image_size,
|
||||
best_height // self.image_size,
|
||||
)
|
||||
images_spatial_crop.append([num_width_tiles, num_height_tiles])
|
||||
|
||||
"""add image tokens"""
|
||||
h = w = math.ceil(
|
||||
(self.image_size // self.patch_size) / self.downsample_ratio
|
||||
)
|
||||
# global views tokens h * (w + 1), 1 is for line separator
|
||||
tokenized_image = [self.image_token_id] * h * (w + 1)
|
||||
# add a separator between global and local views
|
||||
tokenized_image += [self.image_token_id]
|
||||
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
|
||||
tokenized_image += (
|
||||
[self.image_token_id]
|
||||
* (num_height_tiles * h)
|
||||
* (num_width_tiles * w + 1)
|
||||
)
|
||||
|
||||
tokenized_str += tokenized_image
|
||||
images_seq_mask += [True] * len(tokenized_image)
|
||||
num_image_tokens.append(len(tokenized_image))
|
||||
|
||||
"""process the last text split"""
|
||||
tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
|
||||
tokenized_str += tokenized_sep
|
||||
images_seq_mask += [False] * len(tokenized_sep)
|
||||
|
||||
"""add the bos and eos tokens"""
|
||||
if bos:
|
||||
tokenized_str = [self.bos_id] + tokenized_str
|
||||
images_seq_mask = [False] + images_seq_mask
|
||||
if eos:
|
||||
tokenized_str = tokenized_str + [self.eos_id]
|
||||
images_seq_mask = images_seq_mask + [False]
|
||||
|
||||
assert len(tokenized_str) == len(images_seq_mask), (
|
||||
f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
|
||||
)
|
||||
|
||||
return (
|
||||
tokenized_str,
|
||||
images_list,
|
||||
images_seq_mask,
|
||||
images_spatial_crop,
|
||||
num_image_tokens,
|
||||
)
|
||||
|
||||
|
||||
AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
|
||||
233
vllm/transformers_utils/processors/hunyuan_vl.py
Normal file
233
vllm/transformers_utils/processors/hunyuan_vl.py
Normal file
@@ -0,0 +1,233 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/processing_hunyuan_vl.py
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoProcessor
|
||||
from transformers.feature_extraction_utils import BatchFeature
|
||||
from transformers.image_utils import ImageInput
|
||||
from transformers.processing_utils import ProcessorMixin
|
||||
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from transformers.video_utils import VideoInput
|
||||
|
||||
|
||||
class HunYuanVLProcessor(ProcessorMixin):
|
||||
attributes = ["image_processor", "tokenizer"]
|
||||
valid_kwargs = ["chat_template"]
|
||||
image_processor_class = "AutoImageProcessor"
|
||||
tokenizer_class = "AutoTokenizer" # ("AutoTokenizer", None)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
image_processor=None,
|
||||
tokenizer=None,
|
||||
video_processor=None,
|
||||
chat_template=None,
|
||||
**kwargs,
|
||||
):
|
||||
# TODO Fix the init
|
||||
self.tokenizer = tokenizer
|
||||
self.image_token_id = 120120 # self.tokenizer.image_token_id
|
||||
self.image_token = self.tokenizer.convert_ids_to_tokens(self.image_token_id)
|
||||
self.im_start_token_id = 120118 # self.tokenizer.im_start_id
|
||||
self.im_start_token = self.tokenizer.convert_ids_to_tokens(
|
||||
self.im_start_token_id
|
||||
)
|
||||
self.im_end_token_id = 120119 # self.tokenizer.im_end_id
|
||||
self.im_end_token = self.tokenizer.convert_ids_to_tokens(self.im_end_token_id)
|
||||
self.placeholder_token = self.tokenizer.convert_ids_to_tokens(
|
||||
self.tokenizer.vocab_size - 1
|
||||
)
|
||||
self.pad_id = 120002 # self.tokenizer.pad_token_id
|
||||
|
||||
super().__init__(
|
||||
image_processor, tokenizer, video_processor, chat_template=chat_template
|
||||
)
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
images: ImageInput = None,
|
||||
text: TextInput
|
||||
| PreTokenizedInput
|
||||
| list[TextInput]
|
||||
| list[PreTokenizedInput] = None,
|
||||
videos: VideoInput = None,
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
image_inputs = {}
|
||||
if images is not None:
|
||||
image_inputs = self.image_processor(images=images)
|
||||
image_grid_thw = image_inputs["image_grid_thw"]
|
||||
|
||||
if not isinstance(text, list):
|
||||
text = [text]
|
||||
|
||||
text = text.copy() # below lines change text in-place
|
||||
|
||||
image_tokens_cumsum = [0]
|
||||
if images is not None:
|
||||
index = 0
|
||||
for i in range(len(text)):
|
||||
while self.image_token in text[i]:
|
||||
grid_h, grid_w = image_grid_thw[index][-2:]
|
||||
patch_h = grid_h // self.image_processor.merge_size
|
||||
patch_w = grid_w // self.image_processor.merge_size
|
||||
num_image_tokens = patch_h * (patch_w + 1) + 2
|
||||
image_tokens_cumsum.append(
|
||||
image_tokens_cumsum[-1] + num_image_tokens
|
||||
)
|
||||
# text[i] = text[i].replace(self.image_token, self.im_start_token + self.placeholder_token * num_image_tokens + self.im_end_token, 1) # noqa: E501
|
||||
text[i] = text[i].replace(
|
||||
self.image_token, self.placeholder_token * num_image_tokens, 1
|
||||
)
|
||||
index += 1
|
||||
text[i] = text[i].replace(self.placeholder_token, self.image_token)
|
||||
# text[i] = self.tokenizer.bos_token + text[i]
|
||||
|
||||
text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
|
||||
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
|
||||
|
||||
input_ids = text_inputs["input_ids"]
|
||||
position_ids = torch.arange(len(input_ids[0]))
|
||||
position_ids_w = torch.arange(len(input_ids[0]))
|
||||
position_ids_h = torch.arange(len(input_ids[0]))
|
||||
position_ids_t = torch.arange(len(input_ids[0]))
|
||||
|
||||
if images is not None:
|
||||
image_token_pos_indices = torch.where(input_ids[0] == self.image_token_id)[
|
||||
0
|
||||
]
|
||||
for i in range(len(image_grid_thw)):
|
||||
grid_h, grid_w = image_grid_thw[i][-2:]
|
||||
patch_h = grid_h // self.image_processor.merge_size
|
||||
patch_w = grid_w // self.image_processor.merge_size
|
||||
start_pos = image_token_pos_indices[image_tokens_cumsum[i]].item() + 1
|
||||
replace_num = (patch_w + 1) * patch_h
|
||||
position_ids_w[start_pos : start_pos + replace_num] = torch.tensor(
|
||||
list(range(patch_w + 1)) * patch_h, dtype=torch.int64
|
||||
)
|
||||
patch_h_list = []
|
||||
for h in range(patch_h):
|
||||
patch_h_list += [h] * (patch_w + 1)
|
||||
position_ids_h[start_pos : start_pos + replace_num] = torch.tensor(
|
||||
patch_h_list, dtype=torch.int64
|
||||
)
|
||||
position_ids_t[start_pos : start_pos + replace_num] = 0
|
||||
|
||||
position_ids = torch.stack(
|
||||
[position_ids, position_ids_w, position_ids_h, position_ids_t]
|
||||
).unsqueeze(0)
|
||||
text_inputs["position_ids"] = position_ids
|
||||
|
||||
attention_mask = input_ids.ne(self.pad_id)
|
||||
text_inputs["attention_mask"] = attention_mask
|
||||
text_inputs["imgs_pos"] = [self.get_imgs_pos(e) for e in input_ids]
|
||||
# image_inputs["imgs"] = [[image_inputs["pixel_values"]]]
|
||||
|
||||
return_tensors = kwargs.pop("return_tensors", None)
|
||||
return BatchFeature(
|
||||
data={**text_inputs, **image_inputs},
|
||||
tensor_type=return_tensors,
|
||||
)
|
||||
|
||||
def batch_decode(self, *args, **kwargs):
|
||||
return self.tokenizer.batch_decode(*args, **kwargs)
|
||||
|
||||
def decode(self, *args, **kwargs):
|
||||
return self.tokenizer.decode(*args, **kwargs)
|
||||
|
||||
def post_process_image_text_to_text(
|
||||
self,
|
||||
generated_outputs,
|
||||
skip_special_tokens=True,
|
||||
clean_up_tokenization_spaces=False,
|
||||
**kwargs,
|
||||
):
|
||||
assert 0
|
||||
|
||||
def apply_chat_template(self, *args, **kwargs):
|
||||
token_ids = self.tokenizer.apply_chat_template(*args, **kwargs)
|
||||
return token_ids
|
||||
|
||||
def get_imgs_pos(self, doc_ids):
|
||||
doc_ids = np.array(doc_ids, dtype=np.int64)
|
||||
img_begin_index = np.where(doc_ids == self.im_start_token_id)[0]
|
||||
img_end_index = np.where(doc_ids == self.im_end_token_id)[0]
|
||||
imgs_pos = np.concatenate(
|
||||
(
|
||||
np.reshape(img_begin_index + 1, (-1, 1)),
|
||||
np.reshape(img_end_index, (-1, 1)),
|
||||
),
|
||||
axis=-1,
|
||||
).tolist()
|
||||
return imgs_pos
|
||||
|
||||
@property
|
||||
def model_input_names(self):
|
||||
tokenizer_input_names = self.tokenizer.model_input_names
|
||||
image_processor_input_names = self.image_processor.model_input_names
|
||||
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||
|
||||
|
||||
def split_image_into_patch_blocks(
|
||||
pixel_values: torch.Tensor, # shape: [batch_size, 3, H, W]
|
||||
patch_size: int = 16, # e.g. 16
|
||||
adaptor_patch_div: int = 4, # e.g. 4 --> each patch_size is cut into 4x4 small regions, i.e. patch_size // 4 # noqa: E501
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Split the input image tensor (supporting batch) into large patches of size `patch_size`,
|
||||
and then further divide each large patch into smaller regions of size
|
||||
(patch_size // adaptor_patch_div) x (patch_size // adaptor_patch_div).
|
||||
Each small region is extracted as a tensor of shape [3, patch_size, patch_size].
|
||||
The final output contains all such small region tensors.
|
||||
|
||||
Args:
|
||||
pixel_values: Input image tensor of shape [batch_size, 3, H, W].
|
||||
patch_size: Size of the large patch, e.g., 16.
|
||||
adaptor_patch_div: Each large patch is divided into
|
||||
(patch_size // adaptor_patch_div) x (patch_size // adaptor_patch_div)
|
||||
smaller regions.
|
||||
|
||||
Returns:
|
||||
patches: A tensor of shape [N, 3, patch_size, patch_size],
|
||||
where N = batch_size * (H // patch_size) * (W // patch_size) * (patch_size // adaptor_patch_div)^2.
|
||||
Each element in the batch corresponds to one small image region.
|
||||
""" # noqa: E501
|
||||
batch_size, channels, height, width = pixel_values.shape
|
||||
assert channels == 3, "Pixel values must have 3 channels in dim=1"
|
||||
assert height % patch_size == 0 and width % patch_size == 0, (
|
||||
"H and W must be divisible by patch_size"
|
||||
)
|
||||
|
||||
patch_height_num = height // patch_size
|
||||
patch_width_num = width // patch_size
|
||||
|
||||
# Reshape to [B, 3, ph, ps, pw, ps]
|
||||
img = pixel_values.reshape(
|
||||
batch_size, 3, patch_height_num, patch_size, patch_width_num, patch_size
|
||||
)
|
||||
|
||||
# Further split each psxps patch into (ps//aps)x(ps//aps) small regions
|
||||
img = img.reshape(
|
||||
batch_size,
|
||||
3,
|
||||
patch_height_num,
|
||||
patch_size // adaptor_patch_div, # ps // aps
|
||||
adaptor_patch_div,
|
||||
patch_width_num,
|
||||
patch_size // adaptor_patch_div, # ps // aps
|
||||
adaptor_patch_div,
|
||||
)
|
||||
|
||||
# Permute to group the small regions: [B, ph, pw, ps//aps, ps//aps, 3, aps, aps]
|
||||
img = img.permute(0, 2, 5, 3, 6, 1, 4, 7)
|
||||
|
||||
# Reshape into [B * ph * pw * (ps//aps)^2, 3, patch_size, patch_size]
|
||||
patches = img.reshape(-1, 3, patch_size, patch_size)
|
||||
|
||||
return patches
|
||||
|
||||
|
||||
AutoProcessor.register("HunYuanVLProcessor", HunYuanVLProcessor)
|
||||
477
vllm/transformers_utils/processors/hunyuan_vl_image.py
Normal file
477
vllm/transformers_utils/processors/hunyuan_vl_image.py
Normal file
@@ -0,0 +1,477 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/image_processing_hunyuan_vl.py
|
||||
"""Image processor class for HunYuanVL."""
|
||||
|
||||
# isort conflicts with ruff for transformers imports
|
||||
# isort: skip_file
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
import torchvision.transforms as transforms
|
||||
from transformers import AutoImageProcessor
|
||||
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
|
||||
from transformers.image_transforms import (
|
||||
convert_to_rgb,
|
||||
)
|
||||
from transformers.image_utils import (
|
||||
OPENAI_CLIP_MEAN,
|
||||
OPENAI_CLIP_STD,
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
make_flat_list_of_images,
|
||||
make_list_of_images,
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from transformers.utils import TensorType, logging
|
||||
from transformers.video_utils import VideoInput, make_batched_videos
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
def smart_resize(
|
||||
height: int,
|
||||
width: int,
|
||||
factor: int = 16,
|
||||
min_pixels: int = 512 * 512,
|
||||
max_pixels: int = 2048 * 2048,
|
||||
):
|
||||
"""Rescales the image so that the following conditions are met:
|
||||
|
||||
1. Both dimensions (height and width) are divisible by 'factor'.
|
||||
|
||||
2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
|
||||
|
||||
3. The aspect ratio of the image is maintained as closely as possible.
|
||||
|
||||
"""
|
||||
if max(height, width) / min(height, width) > 200:
|
||||
raise ValueError(
|
||||
"absolute aspect ratio must be smaller than 200, got "
|
||||
f"{max(height, width) / min(height, width)}"
|
||||
)
|
||||
h_bar = round(height / factor) * factor
|
||||
w_bar = round(width / factor) * factor
|
||||
if h_bar * w_bar > max_pixels:
|
||||
beta = math.sqrt((height * width) / max_pixels)
|
||||
h_bar = max(factor, math.floor(height / beta / factor) * factor)
|
||||
w_bar = max(factor, math.floor(width / beta / factor) * factor)
|
||||
elif h_bar * w_bar < min_pixels:
|
||||
beta = math.sqrt(min_pixels / (height * width))
|
||||
h_bar = math.ceil(height * beta / factor) * factor
|
||||
w_bar = math.ceil(width * beta / factor) * factor
|
||||
return h_bar, w_bar
|
||||
|
||||
|
||||
class HunYuanVLImageProcessor(BaseImageProcessor):
|
||||
model_input_names = [
|
||||
"pixel_values",
|
||||
"image_grid_thw",
|
||||
"pixel_values_videos",
|
||||
"video_grid_thw",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
do_resize: bool = True,
|
||||
size: dict[str, int] | None = None,
|
||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||
do_rescale: bool = True,
|
||||
rescale_factor: int | float = 1 / 255,
|
||||
do_normalize: bool = True,
|
||||
image_mean: float | list[float] | None = None,
|
||||
image_std: float | list[float] | None = None,
|
||||
do_convert_rgb: bool = True,
|
||||
min_pixels: int | None = None,
|
||||
max_pixels: int | None = None,
|
||||
patch_size: int = 16,
|
||||
temporal_patch_size: int = 2,
|
||||
merge_size: int = 2,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
if size is not None and (
|
||||
"shortest_edge" not in size or "longest_edge" not in size
|
||||
):
|
||||
raise ValueError(
|
||||
"size must contain 'shortest_edge' and 'longest_edge' keys."
|
||||
)
|
||||
else:
|
||||
size = {"shortest_edge": 512 * 512, "longest_edge": 2048 * 2048}
|
||||
# backward compatibility: override size with min_pixels and max_pixels
|
||||
# if they are provided.
|
||||
if min_pixels is not None:
|
||||
size["shortest_edge"] = min_pixels
|
||||
if max_pixels is not None:
|
||||
size["longest_edge"] = max_pixels
|
||||
self.min_pixels = size["shortest_edge"]
|
||||
self.max_pixels = size["longest_edge"]
|
||||
self.size = size
|
||||
|
||||
self.do_resize = do_resize
|
||||
self.resample = resample
|
||||
self.do_rescale = do_rescale
|
||||
self.rescale_factor = rescale_factor
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
|
||||
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
|
||||
|
||||
self.patch_size = patch_size
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
self.merge_size = merge_size
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
# hard-code
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
images: ImageInput | VideoInput,
|
||||
do_resize: bool | None = None,
|
||||
size: dict[str, int] | None = None,
|
||||
resample: PILImageResampling = None,
|
||||
do_rescale: bool | None = None,
|
||||
rescale_factor: float | None = None,
|
||||
do_normalize: bool | None = None,
|
||||
image_mean: float | list[float] | None = None,
|
||||
image_std: float | list[float] | None = None,
|
||||
patch_size: int = 16,
|
||||
temporal_patch_size: int = 2,
|
||||
merge_size: int = 2,
|
||||
do_convert_rgb: bool | None = None,
|
||||
data_format: ChannelDimension | None = ChannelDimension.FIRST,
|
||||
input_data_format: str | ChannelDimension | None = None,
|
||||
):
|
||||
"""
|
||||
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
|
||||
|
||||
Args:
|
||||
images (`ImageInput`):
|
||||
Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
|
||||
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
||||
Whether to resize the image.
|
||||
size (`dict[str, int]`, *optional*, defaults to `self.size`):
|
||||
Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
|
||||
Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
|
||||
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
||||
Whether to rescale the image.
|
||||
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
||||
Scale factor to use if rescaling the image.
|
||||
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
||||
Whether to normalize the image.
|
||||
image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
|
||||
Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
||||
image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
|
||||
Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
||||
patch_size (`int`, *optional*, defaults to `self.patch_size`):
|
||||
The spatial patch size of the vision encoder.
|
||||
temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
|
||||
The temporal patch size of the vision encoder.
|
||||
merge_size (`int`, *optional*, defaults to `self.merge_size`):
|
||||
The merge size of the vision encoder to llm encoder.
|
||||
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
|
||||
Whether to convert the image to RGB.
|
||||
data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
|
||||
The channel dimension format for the output image. Can be one of:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
- Unset: Use the channel dimension format of the input image.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format for the input image. Can be one of:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
||||
""" # noqa: E501
|
||||
images = make_list_of_images(images)
|
||||
|
||||
if do_convert_rgb:
|
||||
images = [convert_to_rgb(image) for image in images]
|
||||
|
||||
width, height = images[0].width, images[0].height
|
||||
resized_width, resized_height = width, height
|
||||
processed_images = []
|
||||
for image in images:
|
||||
if do_resize:
|
||||
resized_height, resized_width = smart_resize(
|
||||
height=height,
|
||||
width=width,
|
||||
factor=patch_size * merge_size,
|
||||
min_pixels=self.min_pixels,
|
||||
max_pixels=self.max_pixels,
|
||||
)
|
||||
image = image.resize((resized_width, resized_height))
|
||||
|
||||
if do_normalize:
|
||||
image = transforms.Compose(
|
||||
[
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize(self.image_mean, self.image_std),
|
||||
]
|
||||
)(image)
|
||||
processed_images.append(image)
|
||||
|
||||
patches = np.array(processed_images)
|
||||
channel = patches.shape[1]
|
||||
grid_t = patches.shape[0] // temporal_patch_size
|
||||
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
|
||||
patches = patches.reshape(
|
||||
1,
|
||||
channel,
|
||||
grid_h // merge_size,
|
||||
merge_size,
|
||||
patch_size,
|
||||
grid_w // merge_size,
|
||||
merge_size,
|
||||
patch_size,
|
||||
)
|
||||
patches = patches.transpose(0, 2, 3, 5, 6, 1, 4, 7)
|
||||
flatten_patches = patches.reshape(
|
||||
1 * grid_h * grid_w, channel * patch_size * patch_size
|
||||
)
|
||||
|
||||
return flatten_patches, (grid_t, grid_h, grid_w)
|
||||
|
||||
    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool | None = None,
        size: dict[str, int] | None = None,
        min_pixels: int | None = None,
        max_pixels: int | None = None,
        resample: PILImageResampling = None,
        do_rescale: bool | None = None,
        rescale_factor: float | None = None,
        do_normalize: bool | None = None,
        image_mean: float | list[float] | None = None,
        image_std: float | list[float] | None = None,
        patch_size: int | None = None,
        temporal_patch_size: int | None = None,
        merge_size: int | None = None,
        do_convert_rgb: bool | None = None,
        return_tensors: str | TensorType | None = None,
        data_format: ChannelDimension | None = ChannelDimension.FIRST,
        input_data_format: str | ChannelDimension | None = None,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"],
                with the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            min_pixels (`int`, *optional*, defaults to `self.min_pixels`):
                The minimum number of pixels the image may be resized to.
            max_pixels (`int`, *optional*, defaults to `self.max_pixels`):
                The maximum number of pixels the image may be resized to.
            patch_size (`int`, *optional*, defaults to `self.patch_size`):
                The spatial patch size of the vision encoder.
            temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
                The temporal patch size of the vision encoder.
            merge_size (`int`, *optional*, defaults to `self.merge_size`):
                The merge size used when mapping vision-encoder patches to LLM tokens.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """  # noqa: E501
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels

        if size is not None:
            if "shortest_edge" not in size or "longest_edge" not in size:
                raise ValueError(
                    "size must contain 'shortest_edge' and 'longest_edge' keys."
                )
            min_pixels = size["shortest_edge"]
        elif min_pixels is not None and max_pixels is not None:
            # backward compatibility: override size with min_pixels and max_pixels
            # if they are provided.
            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
        else:
            size = {**self.size}

        do_resize = do_resize if do_resize is not None else self.do_resize

        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = (
            rescale_factor if rescale_factor is not None else self.rescale_factor
        )
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        patch_size = patch_size if patch_size is not None else self.patch_size
        temporal_patch_size = (
            temporal_patch_size
            if temporal_patch_size is not None
            else self.temporal_patch_size
        )
        merge_size = merge_size if merge_size is not None else self.merge_size
        do_convert_rgb = (
            do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        )

        if images is not None:
            images = make_flat_list_of_images(images)

        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        data = {}
        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for image in images:
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    size=size,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    patch_size=patch_size,
                    temporal_patch_size=temporal_patch_size,
                    merge_size=merge_size,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data.update(
                {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
            )

        # kept for BC only and should be removed after v5.0
        if videos is not None:
            logger.warning(
                "`HunYuanVLV1ImageProcessor` works only with image inputs "
                "and doesn't process videos anymore. "
                "This is a deprecated behavior and will be removed in v5.0. "
                "Your videos should be forwarded to `HunYuanVLV1VideoProcessor`. "
            )
            videos = make_batched_videos(videos)
            pixel_values_videos, vision_grid_thws_videos = [], []
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    size=size,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    patch_size=patch_size,
                    temporal_patch_size=temporal_patch_size,
                    merge_size=merge_size,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values_videos.extend(patches)
                vision_grid_thws_videos.append(video_grid_thw)
            data.update(
                {
                    "pixel_values_videos": np.array(pixel_values_videos),
                    "video_grid_thw": np.array(vision_grid_thws_videos),
                }
            )

        return BatchFeature(data=data, tensor_type=return_tensors)

    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
        """
        A utility that returns the number of image patches for a given image size.

        Args:
            height (`int`):
                Height of the input image.
            width (`int`):
                Width of the input image.
            images_kwargs (`dict`, *optional*):
                Any kwargs to override defaults of the image processor.
        Returns:
            `int`: Number of image patches per image.
        """
        images_kwargs = images_kwargs or {}
        min_pixels = (
            images_kwargs["min_pixels"]
            if "min_pixels" in images_kwargs
            else self.size["shortest_edge"]
        )
        max_pixels = (
            images_kwargs["max_pixels"]
            if "max_pixels" in images_kwargs
            else self.size["longest_edge"]
        )
        patch_size = images_kwargs.get("patch_size", self.patch_size)
        merge_size = images_kwargs.get("merge_size", self.merge_size)

        factor = patch_size * merge_size
        resized_height, resized_width = smart_resize(
            height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels
        )
        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
        # The +1 per row and +2 overall appear to account for per-row separator tokens
        # and the image begin/end tokens surrounding the patch sequence.
        return grid_h * (grid_w + 1) + 2


AutoImageProcessor.register("HunYuanVLImageProcessor", HunYuanVLImageProcessor)
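A minimal usage sketch (illustrative only, not part of the diff; the checkpoint path is a placeholder and assumes its preprocessor_config.json resolves to the registered HunYuanVLImageProcessor):

from PIL import Image
from transformers import AutoImageProcessor

# Placeholder path; any checkpoint whose image-processor config names HunYuanVLImageProcessor works.
processor = AutoImageProcessor.from_pretrained("path/to/hunyuan-vl-checkpoint")
out = processor.preprocess(images=Image.new("RGB", (640, 480)), return_tensors="pt")
print(out["pixel_values"].shape, out["image_grid_thw"])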
453
vllm/transformers_utils/processors/ovis.py
Normal file
453
vllm/transformers_utils/processors/ovis.py
Normal file
@@ -0,0 +1,453 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# ruff: noqa: E501
# coding=utf-8
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import cached_property

import PIL
import torch
from transformers import AutoProcessor, BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

from vllm.multimodal.image import convert_image_mode

__all__ = ["OvisProcessor"]
IGNORE_ID = -100

class OvisProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "images_kwargs": {
            "max_partition": 9,
            "covering_threshold": 0.9,
            "convert_to_rgb": True,
            "return_tensors": "pt",
        },
    }

class OvisProcessor(ProcessorMixin):
    r"""
    Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
    [`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
    [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information.
    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"]

    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        image_pad_token=None,
        image_segment_len=255,
        **kwargs,
    ):
        self.image_token = "<image>"
        self.image_pad_token = image_pad_token
        self.image_segment_len = image_segment_len
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    @cached_property
    def extra_special_tokens(self):
        image_pad_token_id = self.tokenizer.get_vocab()[self.image_pad_token]
        extra_special_tokens = {
            "image_token": -200,
            "image_atom": -300,
            "image_start": -301,
            "image_prefix": -302,
            "image_col_sep": -303,
            "image_row_sep": -304,
            "image_end": -305,
            "image_pad": image_pad_token_id,
        }
        return extra_special_tokens

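    # Note: the negative values above are internal sentinels used only while the prompt
    # is being assembled; construct_image_placeholders() later replaces every indicator
    # with real `image_pad` token ids, so they never appear in the final input_ids.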
    def __call__(
        self,
        images: ImageInput = None,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        **kwargs: Unpack[OvisProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
            - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            OvisProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

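        # _merge_kwargs layers per-call kwargs over OvisProcessorKwargs._defaults and the
        # tokenizer's init kwargs, so images_kwargs falls back to max_partition=9,
        # covering_threshold=0.9, convert_to_rgb=True unless overridden by the caller.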
        # Process all images first
        image_features = {}
        if images is not None:
            processed_images = []
            image_placeholders_list = []
            grids = []

            # Process each image
            for image in images if isinstance(images, list) else [images]:
                pixel_values, image_placeholders, grid = self.preprocess_image(
                    image=image, **output_kwargs["images_kwargs"]
                )
                processed_images.append(pixel_values)
                image_placeholders_list.append(image_placeholders)
                grids.append(grid)

            # assign all processed images
            if processed_images:
                image_features["image_placeholders"] = image_placeholders_list

        # Process text input
        if text is not None:
            if not isinstance(text, list):
                text = [text]

            tokenized_batched_text = self._tokenize_with_image_symbol(text)
            image_token_id = self.get_token_value("image_token")
            replaced_ids_list = []
            idx = 0
            for ids_tensor in tokenized_batched_text:
                if (
                    image_token_id in ids_tensor
                    and "image_placeholders" in image_features
                ):
                    if idx < len(image_features["image_placeholders"]):
                        # Convert to a list for ease of use
                        ids_list = ids_tensor.tolist()

                        new_ids = []

                        # replace placeholders
                        for i, token_id in enumerate(ids_list):
                            if token_id == image_token_id:
                                placeholder_ids = image_features["image_placeholders"][
                                    idx
                                ]
                                new_ids.extend(placeholder_ids)
                                idx += 1
                            else:
                                new_ids.append(token_id)

                        # Convert back to a tensor
                        ids_tensor = torch.tensor(new_ids, dtype=torch.long)
                    else:
                        raise RuntimeError(
                            "Mismatch between the number of images provided and the number of image placeholders in the text"
                        )

                replaced_ids_list.append(ids_tensor)

            if replaced_ids_list:
                replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
            else:
                replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)

            # Create the output with text features
            output = BatchFeature(
                data={
                    "input_ids": replaced_and_tokenized_ids,
                }
            )

            # Add image features if present
            if image_features:
                output["pixel_values"] = processed_images
                output["grids"] = grids

            return output

        # If only images were provided
        return BatchFeature(data=image_features)

    def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
        batch_token_ids = []
        for text in text_list:
            text_chunks = [
                self.tokenizer(chunk, add_special_tokens=False).input_ids
                for chunk in text.split(self.image_token)
            ]
            token_ids = []
            num_chunks = len(text_chunks)
            for i, chunk in enumerate(text_chunks):
                token_ids.extend(chunk)
                if i < num_chunks - 1:
                    token_ids.append(self.get_token_value("image_token"))
            batch_token_ids.append(token_ids)
        return torch.tensor(batch_token_ids, dtype=torch.long)

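    # Illustrative example: "hi <image> there" becomes
    #   tokenize("hi ") + [image_token (-200)] + tokenize(" there")
    # i.e. each "<image>" occurrence is swapped for the internal image_token sentinel.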
    def get_image_size(self):
        size = self.image_processor.size
        if "shortest_edge" in size:
            width = height = size["shortest_edge"]
        elif "height" in size and "width" in size:
            width = size["width"]
            height = size["height"]
        else:
            raise ValueError("Can't parse image size from image_processor config.")
        return height, width

    def get_token_value(self, tok):
        return self.extra_special_tokens[tok]

    def construct_image_indicators(self, grid):
        image_placeholders = [
            self.get_token_value("image_start"),
            self.get_token_value("image_atom"),
            self.get_token_value("image_prefix"),
        ]
        if grid[0] * grid[1] > 1:
            for r in range(grid[0]):
                for c in range(grid[1]):
                    image_placeholders.append(self.get_token_value("image_atom"))
                    if c < grid[1] - 1:
                        image_placeholders.append(self.get_token_value("image_col_sep"))
                if r < grid[0] - 1:
                    image_placeholders.append(self.get_token_value("image_row_sep"))
        image_placeholders.append(self.get_token_value("image_end"))
        return image_placeholders

    def construct_image_placeholders(self, grid):
        image_placeholders = self.construct_image_indicators(grid)

        image_atom_token_id = self.get_token_value("image_atom")
        # Extract the padding token ID from tokenizer
        image_padding_token_id = self.get_token_value("image_pad")

        # Create a new list with padding tokens inserted
        padded_placeholder_tokens = []
        for token in image_placeholders:
            padded_placeholder_tokens.append(image_padding_token_id)
            if token == image_atom_token_id:
                padded_placeholder_tokens.extend(
                    [image_padding_token_id] * self.image_segment_len
                )
        return padded_placeholder_tokens

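    # Worked example (illustrative): a 1x1 grid yields the indicators
    # [image_start, image_atom, image_prefix, image_end]; with image_segment_len=255 the
    # atom expands to 1 + 255 pads, for a total of 259 `image_pad` tokens.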
    def preprocess_image(
        self,
        image: PIL.Image.Image,
        max_partition,
        covering_threshold,
        convert_to_rgb,
        return_tensors,
    ):
        def _preprocess(img: PIL.Image.Image, side):
            # first resize and preprocess
            w, h = img.size
            if w == h:
                new_width = new_height = side
            elif w > h:
                new_width = side
                new_height = int(h / w * new_width)
            else:
                new_height = side
                new_width = int(w / h * new_height)
            new_size = dict(height=new_height, width=new_width)
            pixel_values = self.image_processor.preprocess(
                img, size=new_size, return_tensors=return_tensors
            )["pixel_values"]

            # then pad to square
            square_values = torch.zeros(
                [1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
            )
            new_height, new_width = pixel_values.shape[2:]
            if new_height == new_width:
                square_values[:, :, :, :] = pixel_values
            elif new_height > new_width:
                from_index = (side - new_width) // 2
                square_values[:, :, :, from_index : from_index + new_width] = (
                    pixel_values
                )
            else:
                from_index = (side - new_height) // 2
                square_values[:, :, from_index : from_index + new_height, :] = (
                    pixel_values
                )

            return square_values

        def _partition(img, grid) -> list[tuple[int, int, int, int]]:
            w, h = img.size
            row_height = h // grid[0]
            col_width = w // grid[1]

            partition = []
            for row in range(grid[0]):
                for col in range(grid[1]):
                    left = col * col_width
                    upper = row * row_height
                    right = w if col == grid[1] - 1 else (col + 1) * col_width
                    lower = h if row == grid[0] - 1 else (row + 1) * row_height
                    partition.append((left, upper, right, lower))

            return partition

        def _covering_area(left, upper, right, lower, side):
            w = right - left
            h = lower - upper
            w, h = max(w, h), min(w, h)
            if w > side:
                h = h / w * side
                w = side
            return w * h

        def _get_best_grid(img, side):
            img_area = img.size[0] * img.size[1]

            candidate_grids = []
            for i in range(1, max_partition + 1):
                for j in range(1, max_partition + 1):
                    if i * j <= max_partition:
                        candidate_grids.append((i, j))

            all_grids = []
            good_grids = []
            for grid in candidate_grids:
                partition = _partition(img, grid)
                covering_ratio = (
                    sum([_covering_area(*p, side) for p in partition]) / img_area
                )
                assert covering_ratio <= 1.0
                all_grids.append((grid, covering_ratio))
                if covering_ratio > covering_threshold:
                    good_grids.append((grid, covering_ratio))

            if len(good_grids) > 0:
                # pick the good partition with minimum #sub_images and break the tie using covering_ratio
                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][
                    0
                ]
            else:
                # pick the partition with maximum covering_ratio and break the tie using #sub_images
                return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]

        if convert_to_rgb:
            image = convert_image_mode(image, "RGB")

        sides = self.get_image_size()
        if sides[0] != sides[1]:
            raise ValueError("get_image_size() returns non-square size")
        side = sides[0]
        grid = _get_best_grid(image, side)
        partition = _partition(image, grid)
        crops = [image.crop(p) for p in partition]
        if len(crops) > 1:
            crops.insert(0, image)
        pixel_values = torch.cat([_preprocess(crop, side) for crop in crops], dim=0)
        image_placeholders = self.construct_image_placeholders(grid)
        return torch.tensor(pixel_values), image_placeholders, torch.tensor(grid)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(self, generated_outputs):
        """
        Post-process the output of the model to decode the text.
        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
        Returns:
            `list[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        names_from_processor = list(
            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
        )
        return names_from_processor + ["second_per_grid_ts"]


AutoProcessor.register("OvisProcessor", OvisProcessor)
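A minimal usage sketch (illustrative only; the checkpoint path is a placeholder and assumes its processor config resolves to OvisProcessor with an image_pad_token set):

from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/ovis-checkpoint")
batch = processor(images=[Image.new("RGB", (448, 448))], text=["Describe <image>"])
print(batch["input_ids"].shape, batch["pixel_values"][0].shape, batch["grids"][0])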
468
vllm/transformers_utils/processors/ovis2_5.py
Normal file
468
vllm/transformers_utils/processors/ovis2_5.py
Normal file
@@ -0,0 +1,468 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from functools import cached_property

import numpy as np
import PIL
import torch
from transformers import AutoProcessor, BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

__all__ = ["Ovis2_5Processor"]
IMAGE_TOKEN = "<image>"
VIDEO_TOKEN = "<video>"
MIN_PIXELS = 448 * 448
MAX_PIXELS = 1792 * 1792

class Ovis2_5ProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "images_kwargs": {
            "convert_to_rgb": True,
            "min_pixels": MIN_PIXELS,
            "max_pixels": MAX_PIXELS,
        },
        "videos_kwargs": {
            "convert_to_rgb": True,
            "min_pixels": MIN_PIXELS,
            "max_pixels": MAX_PIXELS,
        },
    }

class Ovis2_5Processor(ProcessorMixin):
    r"""
    Constructs an Ovis 2.5 processor which wraps an Ovis image processor
    and a Qwen2 tokenizer into a single processor.
    [`Ovis2_5Processor`] offers all the functionalities of
    [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
    See the [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`]
    for more information.
    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will
            be used to convert lists of messages in a chat into
            a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template", "image_pad_token"]

    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        image_pad_token=None,
        patch_size=16,
        hidden_stride=2,
        temporal_patch_size=1,
        **kwargs,
    ):
        self.image_token = IMAGE_TOKEN
        self.video_token = VIDEO_TOKEN
        self.image_pad_token = "<|image_pad|>"

        self.patch_size = patch_size
        self.hidden_stride = hidden_stride
        self.temporal_patch_size = temporal_patch_size
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    @cached_property
    def extra_special_tokens(self):
        image_pad_token_id = self.tokenizer.get_vocab()[self.image_pad_token]
        extra_special_tokens = {
            "image_token": -200,
            "video_token": -201,
            "visual_atom": -300,
            "image_start": -301,
            "image_end": -302,
            "video_start": -303,
            "video_end": -304,
            "image_pad": image_pad_token_id,
        }
        return extra_special_tokens

    def __call__(
        self,
        images: ImageInput = None,
        videos: np.ndarray | list[ImageInput] = None,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        **kwargs: Unpack[Ovis2_5ProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare one or several sequence(s)
        and image(s) for the model. This method forwards the `text` and
        `kwargs` arguments to Qwen2TokenizerFast's
        [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to
        encode the text. To prepare the vision inputs, this method
        forwards the `vision_infos` and `kwargs` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`]
        if `vision_infos` is not `None`.
        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`,
                `list[PIL.Image.Image]`, `list[np.ndarray]`,
                `list[torch.Tensor]`):
                The image or batch of images to be prepared.
                Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats
                are supported.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded.
                Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as
                list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with
                a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`,
                `list[torch.Tensor]`):
                The image or batch of videos to be prepared. Each video
                can be a 4D NumPy array or PyTorch tensor, or a nested
                list of 3D frames. Both channels-first and channels-last
                formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework.
                Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
            - **input_ids** -- list of token ids to be fed to a model.
              Returned when `text` is not `None`.
            - **attention_mask** -- list of indices specifying which tokens
              should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"*
              is in `self.model_input_names` and if `text` is not `None`).
            - **pixel_values** -- Pixel values to be fed to a model.
              Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to
              a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- list of image 3D grid in LLM. Returned
              when `images` is not `None`.
            - **video_grid_thw** -- list of video 3D grid in LLM. Returned
              when `videos` is not `None`.
            - **second_per_grid_ts** -- list of video seconds per time grid.
              Returned when `videos` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            Ovis2_5ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        # Process all images first
        visual_features = {}
        output = BatchFeature()
        if images is not None:
            processed_images = []
            image_placeholders_list = []
            grids = []
            # Process each image
            for image in images if isinstance(images, list) else [images]:
                pixel_values, image_placeholders, grid = self.preprocess_multidata(
                    images=image, **output_kwargs["images_kwargs"]
                )
                processed_images.append(pixel_values)
                image_placeholders_list.append(image_placeholders)
                grids.append(grid)

            # assign all processed images
            if processed_images:
                visual_features["image_placeholders"] = image_placeholders_list
                output["pixel_values"] = processed_images
                output["grids"] = grids

        if videos is not None:
            processed_videos = []
            videos_placeholders_list = []
            grids = []
            # Process each video
            for video in videos if isinstance(videos, list) else [videos]:
                pixel_values, video_placeholders, grid = self.preprocess_multidata(
                    video=video, **output_kwargs["videos_kwargs"]
                )
                processed_videos.append(pixel_values)
                videos_placeholders_list.append(video_placeholders)
                grids.append(grid)
            # assign all processed videos
            if processed_videos:
                visual_features["video_placeholders"] = videos_placeholders_list
                output["video_pixel_values"] = processed_videos
                output["video_grids"] = grids

        # Process text input
        if text is not None:
            if not isinstance(text, list):
                text = [text]
            tokenized_batched_text = self._tokenize_with_visual_symbol(text)
            image_token_id = self.get_token_value("image_token")
            video_token_id = self.get_token_value("video_token")
            replaced_ids_list = []
            image_idx = 0
            video_idx = 0
            for ids_tensor in tokenized_batched_text:
                has_image_tokens = (
                    image_token_id in ids_tensor
                    and "image_placeholders" in visual_features
                    and image_idx < len(visual_features["image_placeholders"])
                )
                has_video_tokens = (
                    video_token_id in ids_tensor
                    and "video_placeholders" in visual_features
                    and video_idx < len(visual_features["video_placeholders"])
                )
                if has_image_tokens or has_video_tokens:
                    # Convert to list for easier manipulation
                    ids_list = ids_tensor.tolist()
                    new_ids = []

                    # Replace placeholders
                    for token_id in ids_list:
                        if token_id == image_token_id:
                            new_ids.extend(
                                visual_features["image_placeholders"][image_idx]
                            )
                            image_idx += 1
                        elif token_id == video_token_id:
                            new_ids.extend(
                                visual_features["video_placeholders"][video_idx]
                            )
                            video_idx += 1
                        else:
                            new_ids.append(token_id)
                    # Convert back to tensor
                    ids_tensor = torch.tensor(new_ids, dtype=torch.long)
                replaced_ids_list.append(ids_tensor)
            if replaced_ids_list:
                replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
            else:
                replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)
            output["input_ids"] = replaced_and_tokenized_ids

            return output
        # If only images were provided
        return BatchFeature(data=visual_features)

    def _tokenize_with_visual_symbol(self, text_list: list[str]) -> torch.LongTensor:
        batch_token_ids = []
        for text in text_list:
            token_ids = []
            video_token_id = self.get_token_value("video_token")
            image_token_id = self.get_token_value("image_token")
            video_split_texts = text.split(self.video_token)

            for j, video_segment in enumerate(video_split_texts):
                image_split_texts = video_segment.split(self.image_token)
                text_chunks = [
                    self.tokenizer(chunk, add_special_tokens=False).input_ids
                    for chunk in image_split_texts
                ]
                segment_tokens = []
                for i, chunk in enumerate(text_chunks):
                    segment_tokens.extend(chunk)
                    if i < len(text_chunks) - 1:
                        segment_tokens.append(image_token_id)
                token_ids.extend(segment_tokens)
                if j < len(video_split_texts) - 1:
                    token_ids.append(video_token_id)

            batch_token_ids.append(token_ids)
        return torch.tensor(batch_token_ids, dtype=torch.long)

    # Copied from qwen2_vl
    def smart_resize(
        self,
        height: int,
        width: int,
        factor: int = 28,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
    ):
        """Rescales the image so that the following conditions are met:
        1. Both dimensions (height and width) are divisible by 'factor'.
        2. The total number of pixels is within the range
           ['min_pixels', 'max_pixels'].
        3. The aspect ratio of the image is maintained as closely as possible.
        """
        if height < factor or width < factor:
            print(
                f"height:{height} or width:{width} must be larger than factor:{factor}"
            )
            if height < width:
                width = round(factor / height * width)
                height = factor
            else:
                height = round(factor / width * height)
                width = factor

        elif max(height, width) / min(height, width) > 200:
            print(
                f"absolute aspect ratio must be smaller than 200, "
                f"got {max(height, width) / min(height, width)}"
            )
            if height > width:
                height = 200 * width
            else:
                width = 200 * height

        h_bar = round(height / factor) * factor
        w_bar = round(width / factor) * factor
        if h_bar * w_bar > max_pixels:
            beta = math.sqrt((height * width) / max_pixels)
            h_bar = math.floor(height / beta / factor) * factor
            w_bar = math.floor(width / beta / factor) * factor
        elif h_bar * w_bar < min_pixels:
            beta = math.sqrt(min_pixels / (height * width))
            h_bar = math.ceil(height * beta / factor) * factor
            w_bar = math.ceil(width * beta / factor) * factor
        return h_bar, w_bar

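    # Worked example (illustrative): height=1080, width=1920, factor=32
    # (the default patch_size=16 times hidden_stride=2):
    #   h_bar = round(1080 / 32) * 32 = 1088, w_bar = round(1920 / 32) * 32 = 1920;
    #   1088 * 1920 pixels already lies within [448*448, 1792*1792], so no rescaling.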
    def get_token_value(self, tok):
        return self.extra_special_tokens[tok]

    def construct_visual_indicators(self, grid, is_video: bool = False):
        if is_video:
            start_token = self.get_token_value("video_start")
            end_token = self.get_token_value("video_end")
        else:
            start_token = self.get_token_value("image_start")
            end_token = self.get_token_value("image_end")

        image_placeholders = [start_token, self.get_token_value("visual_atom")]
        if grid[0] * grid[1] > 1:
            for r in range(grid[0]):
                for c in range(grid[1]):
                    image_placeholders.append(self.get_token_value("visual_atom"))

        image_placeholders.append(end_token)
        return image_placeholders

    def construct_visual_placeholders(self, grid, is_video: bool = False):
        visual_placeholders = self.construct_visual_indicators((1, 1), is_video)

        image_atom_token_id = self.get_token_value("visual_atom")
        # Extract the padding token ID from tokenizer
        image_padding_token_id = self.get_token_value("image_pad")

        num_image_atoms = grid[0] * grid[1] * grid[2]
        num_image_atoms //= self.hidden_stride**2
        num_image_atoms //= self.temporal_patch_size

        # Create a new list with padding tokens inserted
        padded_placeholder_tokens = []
        for token in visual_placeholders:
            if token == image_atom_token_id:
                padded_placeholder_tokens.extend(
                    [image_padding_token_id] * num_image_atoms
                )
            else:
                padded_placeholder_tokens.append(image_padding_token_id)
        return padded_placeholder_tokens

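    # Note: num_image_atoms above is the number of LLM-side visual tokens:
    # grid_t * grid_h * grid_w patches collapsed by hidden_stride**2 (spatial merge)
    # and by temporal_patch_size, with each atom becoming one `image_pad` slot.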
    def preprocess_multidata(
        self,
        images: PIL.Image.Image | list[PIL.Image.Image] | None = None,
        video: list[PIL.Image.Image] | np.ndarray | None = None,
        convert_to_rgb: bool | None = True,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        return_tensors: str | None = "pt",
    ):
        is_video = False
        if images is not None:
            if not isinstance(images, list):
                images = [images]
        elif video is not None:
            is_video = True
            # the video in dummy_mm_data is passed as an np.ndarray
            if isinstance(video, np.ndarray):
                images = []
                for i in range(video.shape[0]):
                    image = PIL.Image.fromarray(video[i].astype(np.uint8))
                    images.append(image)
            elif isinstance(video, list):
                images = video
        else:
            raise ValueError("Either images or video should be provided.")
        min_pixels = min(
            max_pixels if max_pixels is not None else MAX_PIXELS,
            min_pixels if min_pixels is not None else MIN_PIXELS,
        )
        images = [
            image.convert("RGB") if convert_to_rgb and image.mode != "RGB" else image
            for image in images
        ]

        width, height = images[0].size
        resized_height, resized_width = height, width
        processed_images = []
        for image in images:
            resized_height, resized_width = self.smart_resize(
                height,
                width,
                factor=self.patch_size * self.hidden_stride,
                min_pixels=min_pixels,
                max_pixels=max_pixels,
            )
            new_size = dict(height=resized_height, width=resized_width)
            image_pt = self.image_processor.preprocess(
                image, size=new_size, return_tensors="np"
            )["pixel_values"][0]

            processed_images.append(image_pt)

        patches = np.array(processed_images)
        if patches.shape[0] % self.temporal_patch_size != 0:
            num_to_pad = self.temporal_patch_size - (
                patches.shape[0] % self.temporal_patch_size
            )
            repeats = np.repeat(patches[-1][np.newaxis], num_to_pad, axis=0)
            patches = np.concatenate([patches, repeats], axis=0)
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h = resized_height // self.patch_size
        grid_w = resized_width // self.patch_size

        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h // self.hidden_stride,
            self.hidden_stride,
            self.patch_size,
            grid_w // self.hidden_stride,
            self.hidden_stride,
            self.patch_size,
        )
        patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
        flatten_patches = patches.reshape(
            grid_t * grid_h * grid_w,
            channel * self.temporal_patch_size * self.patch_size * self.patch_size,
        )

        visual_placeholders = self.construct_visual_placeholders(
            [grid_t, grid_h, grid_w], is_video
        )
        return (
            torch.tensor(flatten_patches),
            visual_placeholders,
            torch.tensor([[grid_t, grid_h, grid_w]]),
        )


AutoProcessor.register("Ovis2_5Processor", Ovis2_5Processor)
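A quick shape sanity check for the Ovis2.5 path (illustrative, not part of the commit; assumes the defaults patch_size=16, hidden_stride=2, temporal_patch_size=1):

# A single 448x448 RGB image keeps its size under smart_resize (factor = 16 * 2 = 32),
# giving grid (t, h, w) = (1, 28, 28): pixel_values has 1*28*28 = 784 rows of
# 3*1*16*16 = 768 values, and the visual_atom indicator expands to
# 784 // 2**2 // 1 = 196 `image_pad` tokens (plus one pad each for start and end).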