Add minimal vLLM 0.16.1 build repo for BI-V150

This commit is contained in:
2026-04-18 10:56:22 +08:00
commit d69657327e
1895 changed files with 615301 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Multi-modal processors may be defined in this directory for the following
reasons:
- There is no processing file defined by HF Hub or Transformers library.
- There is a need to override the existing processor to support vLLM.
"""
from vllm.transformers_utils.processors.bagel import BagelProcessor
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
from vllm.transformers_utils.processors.funasr_processor import FunASRProcessor
from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
from vllm.transformers_utils.processors.hunyuan_vl_image import HunYuanVLImageProcessor
from vllm.transformers_utils.processors.ovis import OvisProcessor
from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
# Names re-exported by this package; each entry corresponds to one of the
# processor classes imported above.
__all__ = [
    "BagelProcessor",
    "DeepseekVLV2Processor",
    "FunASRProcessor",
    "HunYuanVLProcessor",
    "HunYuanVLImageProcessor",
    "OvisProcessor",
    "Ovis2_5Processor",
]

View File

@@ -0,0 +1,84 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
"""BAGEL processor for image and text inputs."""
from transformers import AutoProcessor
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
class BagelProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    # Per-modality default keyword arguments. BagelProcessor.__call__ passes
    # this class to `_merge_kwargs` and forwards the resulting
    # "images_kwargs" group to the image processor.
    _defaults = {
        "images_kwargs": {
            # Presumably requests PyTorch tensors from the image processor
            # (Transformers `return_tensors` convention) -- confirm upstream.
            "return_tensors": "pt",
        },
    }
class BagelProcessor(ProcessorMixin):
    """
    BAGEL multi-modal processor.

    Bundles a SigLIP image processor and a Qwen2 tokenizer behind a single
    callable that returns one merged ``BatchFeature``.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "SiglipImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __call__(
        self,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        images: ImageInput = None,
        **kwargs: Unpack[BagelProcessorKwargs],
    ):
        """
        Prepare text and/or images for the model.

        Args:
            text: Prompt(s) handed to the tokenizer; skipped when ``None``.
            images: Image(s) handed to the image processor; skipped when
                ``None``.
            **kwargs: Extra options merged with ``BagelProcessorKwargs``
                defaults and the tokenizer's init kwargs.

        Returns:
            A ``BatchFeature`` holding the image-processor outputs followed
            by the tokenizer outputs (tokenizer keys win on a name clash).
        """
        merged_kwargs = self._merge_kwargs(
            BagelProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        # Images are handled first, then text, each only when supplied.
        image_features = {}
        if images is not None:
            image_features = self.image_processor(
                images, **merged_kwargs["images_kwargs"]
            )

        text_features = {}
        if text is not None:
            text_features = self.tokenizer(text, **merged_kwargs["text_kwargs"])

        return BatchFeature(data={**image_features, **text_features})

    def batch_decode(self, *args, **kwargs):
        """
        Delegate to the wrapped tokenizer's ``batch_decode`` with the same
        arguments.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Delegate to the wrapped tokenizer's ``decode`` with the same
        arguments.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names,
        de-duplicated while keeping first-seen order."""
        combined = (
            self.tokenizer.model_input_names
            + self.image_processor.model_input_names
        )
        return list(dict.fromkeys(combined))


AutoProcessor.register("BagelProcessor", BagelProcessor)

View File

@@ -0,0 +1,458 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek-OCR-master/DeepSeek-OCR-vllm/process/image_process.py
# and https://github.com/deepseek-ai/DeepSeek-OCR-2/blob/main/DeepSeek-OCR2-master/DeepSeek-OCR2-vllm/process/image_process.py
import math
from typing import Literal
import torch
import torchvision.transforms as T
from PIL import Image, ImageOps
from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
from transformers.processing_utils import ProcessorMixin
# TODO(Isotr0py): change modes for variants
# see: https://github.com/deepseek-ai/DeepSeek-OCR/blob/8cf003d38821fa1b19c73da3bd1b0dc262ea8136/DeepSeek-OCR-master/DeepSeek-OCR-vllm/config.py#L1-L6
# Presets listed in the reference config:
#   Tiny:   base_size = 512,  image_size = 512,  crop_mode = False
#   Small:  base_size = 640,  image_size = 640,  crop_mode = False
#   Base:   base_size = 1024, image_size = 1024, crop_mode = False
#   Large:  base_size = 1280, image_size = 1280, crop_mode = False
#   Gundam: base_size = 1024, image_size = 640,  crop_mode = True
# The values below correspond to the "Gundam" preset.
BASE_SIZE = 1024
IMAGE_SIZE = 640
CROP_MODE = True
# TODO(Isotr0py): Expose as mm_kwargs
# Inclusive bounds on the number of tiles (columns * rows) considered when
# choosing a crop grid.
MIN_CROPS = 2
MAX_CROPS = 6  # Upper limit is 9; 6 is recommended when GPU memory is small.
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float("inf")
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def calculate_aspect_ratios(
    min_num: int = MIN_CROPS, max_num: int = MAX_CROPS
) -> list[tuple[int, int]]:
    """Enumerate every (cols, rows) grid with ``min_num <= cols * rows <= max_num``.

    Returns:
        The unique grids sorted by ascending tile count (``cols * rows``).
    """
    # Same nesting order as a plain triple loop so the set is filled in the
    # same sequence; duplicates collapse in the set.
    grids: set[tuple[int, int]] = {
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if cols * rows <= max_num and cols * rows >= min_num
    }
    return sorted(grids, key=lambda grid: grid[0] * grid[1])
def count_tiles(
    orig_width,
    orig_height,
    min_num=MIN_CROPS,
    max_num=MAX_CROPS,
    image_size=640,
    use_thumbnail=False,
):
    """Return the (cols, rows) tile grid chosen for an image of the given size.

    ``use_thumbnail`` is accepted but not used by this function.
    """
    source_ratio = orig_width / orig_height
    # Candidate grids within the allowed tile-count range.
    candidates = calculate_aspect_ratios(min_num, max_num)
    return find_closest_aspect_ratio(
        source_ratio, candidates, orig_width, orig_height, image_size
    )
def dynamic_preprocess(
    image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False
):
    """Resize an image onto the best-matching tile grid and cut it into tiles.

    Args:
        image: Object exposing ``size``, ``resize`` and ``crop`` (a PIL image
            in the callers visible in this file).
        min_num: Minimum number of tiles in a candidate grid.
        max_num: Maximum number of tiles in a candidate grid.
        image_size: Side length of each square tile.
        use_thumbnail: When true and more than one tile was produced, append
            the whole image resized to ``image_size`` x ``image_size``.

    Returns:
        ``(tiles, grid)`` where ``tiles`` is the list of cropped tiles in
        row-major order (plus the optional thumbnail) and ``grid`` is the
        selected ``(cols, rows)`` pair.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h

    candidates = calculate_aspect_ratios(min_num, max_num)
    grid = find_closest_aspect_ratio(src_ratio, candidates, src_w, src_h, image_size)

    canvas_w = image_size * grid[0]
    canvas_h = image_size * grid[1]
    tile_count = grid[0] * grid[1]

    resized = image.resize((canvas_w, canvas_h))

    tiles = []
    for tile_idx in range(tile_count):
        # Row-major position of this tile on the resized canvas.
        col = tile_idx % (canvas_w // image_size)
        row = tile_idx // (canvas_w // image_size)
        tiles.append(
            resized.crop(
                (
                    col * image_size,
                    row * image_size,
                    (col + 1) * image_size,
                    (row + 1) * image_size,
                )
            )
        )
    assert len(tiles) == tile_count

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles, grid
class ImageTransform:
    """Callable that converts a PIL image to a tensor, optionally normalized."""

    def __init__(
        self,
        mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
        std: tuple[float, float, float] = (0.5, 0.5, 0.5),
        normalize: bool = True,
    ):
        """Build the torchvision pipeline.

        Args:
            mean: Per-channel mean passed to ``T.Normalize``.
            std: Per-channel std passed to ``T.Normalize``.
            normalize: Whether to append the normalization step.
        """
        self.mean = mean
        self.std = std
        self.normalize = normalize

        steps = [T.ToTensor(), T.Normalize(mean, std)] if normalize else [T.ToTensor()]
        self.transform = T.Compose(steps)

    def __call__(self, pil_img: Image.Image):
        """Apply the composed transform to ``pil_img`` and return the result."""
        return self.transform(pil_img)
class DeepseekOCRProcessor(ProcessorMixin):
    """Processor that turns a prompt with ``<image>`` tags plus PIL images
    into token ids, image tensors and an image-position mask.

    Each image yields one "global view" padded to ``base_size`` and, when
    cropping applies, a set of ``image_size`` tiles ("local views").
    """

    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
    attributes = ["tokenizer"]

    def __init__(
        self,
        tokenizer: LlamaTokenizerFast,
        patch_size: int = 16,
        downsample_ratio: int = 4,
        image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
        image_std: tuple[float, float, float] = (0.5, 0.5, 0.5),
        normalize: bool = True,
        image_token: str = "<image>",
        pad_token: str = "<▁pad▁>",
        add_special_token: bool = False,
        sft_format: str = "deepseek",
        mask_prompt: bool = True,
        ignore_id: int = -100,
        image_size: int = IMAGE_SIZE,
        base_size: int = BASE_SIZE,
        strategy: Literal["v1", "v2"] = "v1",
        **kwargs,
    ):
        """
        Args:
            tokenizer: Tokenizer to wrap; its padding side is set to "left"
                and a pad token is added if it has none.
            patch_size: Accepted for interface compatibility (see NOTE below).
            downsample_ratio: Accepted for interface compatibility (see NOTE).
            image_mean: Per-channel mean for normalization; also used (scaled
                by 255) as the padding color of the global view.
            image_std: Per-channel std for normalization.
            normalize: Whether images are normalized after tensor conversion.
            image_token: Placeholder string marking image positions.
            pad_token: Pad token added when the tokenizer has none.
            add_special_token, sft_format, mask_prompt, ignore_id: Stored as
                attributes; not otherwise read by the methods of this class.
            image_size: Side length of a local tile.
            base_size: Side length of the padded global view.
            strategy: Image-token counting scheme, "v1" or "v2".
            **kwargs: Forwarded to ``ProcessorMixin.__init__``.
        """
        self.image_size = image_size
        self.base_size = base_size
        # image token calculation strategy for
        # Deepseek-OCR and Deepseek-OCR-2
        self.strategy = strategy
        assert strategy in ["v1", "v2"], "Only 'v1' and 'v2' strategies are supported."

        # NOTE(review): the `patch_size` / `downsample_ratio` arguments are
        # not used; the stored values are fixed at 16 and 4. Confirm whether
        # this is intentional before honoring the arguments.
        self.patch_size = 16
        self.image_mean = image_mean
        self.image_std = image_std
        self.normalize = normalize
        self.downsample_ratio = 4

        self.image_transform = ImageTransform(
            mean=image_mean, std=image_std, normalize=normalize
        )

        self.tokenizer = tokenizer
        # Left padding must be set explicitly; the padding side makes a
        # difference in batch inference.
        self.tokenizer.padding_side = "left"

        # add the pad_token as special token to use 'tokenizer.pad_token'
        # and 'tokenizer.pad_token_id'
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({"pad_token": pad_token})

        # Look up the id of the image placeholder token (None if the
        # tokenizer's vocab does not contain it).
        self.image_token_id = self.tokenizer.vocab.get(image_token)

        self.image_token = image_token
        self.pad_token = pad_token
        self.add_special_token = add_special_token
        self.sft_format = sft_format
        self.mask_prompt = mask_prompt
        self.ignore_id = ignore_id

        super().__init__(
            tokenizer,
            **kwargs,
        )

    @property
    def bos_id(self):
        """Beginning-of-sequence token id of the wrapped tokenizer."""
        return self.tokenizer.bos_token_id

    @property
    def eos_id(self):
        """End-of-sequence token id of the wrapped tokenizer."""
        return self.tokenizer.eos_token_id

    @property
    def pad_id(self):
        """Padding token id of the wrapped tokenizer."""
        return self.tokenizer.pad_token_id

    def encode(self, text: str, bos: bool = True, eos: bool = False):
        """Tokenize ``text`` without automatic special tokens, then
        optionally prepend BOS and/or append EOS."""
        t = self.tokenizer.encode(text, add_special_tokens=False)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: list[int], **kwargs) -> str:
        """Decode token ids with the wrapped tokenizer."""
        return self.tokenizer.decode(t, **kwargs)

    def process_one(
        self,
        prompt: str,
        images: list[Image.Image],
        crop_mode: bool = CROP_MODE,
    ):
        """Process a single prompt together with its images.

        Args:
            prompt (str): the formatted prompt;
            images (list[Image.Image]): the list of images;
            crop_mode (bool): if True, large images are also split into tiles;

        Returns:
            BatchFeature (tensor_type="pt") with the keys ``input_ids``,
            ``pixel_values``, ``images_crop``, ``images_seq_mask``,
            ``images_spatial_crop`` and ``num_image_tokens``.
        """
        assert prompt is not None and images is not None, (
            "prompt and images must be used at the same time."
        )

        (
            input_ids,
            pixel_values,
            images_crop,
            images_seq_mask,
            images_spatial_crop,
            num_image_tokens,
            _,
        ) = self.tokenize_with_images(
            conversation=prompt,
            images=images,
            bos=True,
            eos=True,
            cropping=crop_mode,
        )

        return BatchFeature(
            data=dict(
                input_ids=input_ids,
                pixel_values=pixel_values,
                images_crop=images_crop,
                images_seq_mask=images_seq_mask,
                images_spatial_crop=images_spatial_crop,
                num_image_tokens=num_image_tokens,
            ),
            tensor_type="pt",
        )

    def __call__(
        self,
        *,
        prompt: str,
        images: list[Image.Image],
        crop_mode: bool = CROP_MODE,
        **kwargs,
    ):
        """Keyword-only entry point; delegates to :meth:`process_one`.

        Extra ``**kwargs`` are accepted and ignored.
        """
        return self.process_one(
            prompt=prompt,
            images=images,
            crop_mode=crop_mode,
        )

    def tokenize_with_images(
        self,
        conversation: str,
        images: list[Image.Image],
        bos: bool = True,
        eos: bool = True,
        cropping: bool = True,
    ):
        """Tokenize text with <image> tags.

        Args:
            conversation: Prompt containing one ``image_token`` per image.
            images: Images, in the order their tags appear.
            bos: Prepend the BOS token.
            eos: Append the EOS token while building the sequence; it is
                stripped again before returning. With ``eos=False`` nothing
                is appended and nothing is stripped.
            cropping: Allow splitting large images into tiles. When False,
                every image is resized to ``image_size`` x ``image_size``.

        Returns:
            Tuple ``(input_ids, pixel_values, images_crop, images_seq_mask,
            images_spatial_crop, num_image_tokens, image_shapes)``.
            ``input_ids`` has a leading batch dimension of 1;
            ``images_seq_mask`` is a 1-D bool tensor marking image positions.
        """
        assert conversation.count(self.image_token) == len(images)
        text_splits = conversation.split(self.image_token)

        images_list = []
        images_crop_list = []
        images_seq_mask = []
        images_spatial_crop = []
        image_shapes = []
        num_image_tokens = []
        tokenized_str = []

        # Token counts that depend only on the processor configuration.
        num_queries = math.ceil(
            (self.image_size // self.patch_size) / self.downsample_ratio
        )
        num_queries_base = math.ceil(
            (self.base_size // self.patch_size) / self.downsample_ratio
        )
        # "v1" adds one extra token per row of the global view.
        num_tokens_base = (
            (num_queries_base * (num_queries_base + 1))
            if self.strategy == "v1"
            else num_queries_base * num_queries_base
        )

        for text_sep, image in zip(text_splits, images):
            # Text preceding this image.
            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
            tokenized_str += tokenized_sep
            images_seq_mask += [False] * len(tokenized_sep)

            image_shapes.append(image.size)

            # Decide the tile grid: images that fit into one tile, or any
            # image when cropping is disabled, use a 1x1 grid with no tiles.
            images_crop_raw = []
            if image.size[0] <= self.image_size and image.size[1] <= self.image_size:
                crop_ratio = [1, 1]
            elif cropping:
                images_crop_raw, crop_ratio = dynamic_preprocess(
                    image, image_size=self.image_size
                )
            else:
                crop_ratio = [1, 1]

            if not cropping:
                image = image.resize((self.image_size, self.image_size))

            # Global view, padded to base_size with the mean color.
            global_view = ImageOps.pad(
                image,
                (self.base_size, self.base_size),
                color=tuple(int(x * 255) for x in self.image_transform.mean),
            )
            images_list.append(self.image_transform(global_view))

            num_width_tiles, num_height_tiles = crop_ratio
            images_spatial_crop.append([num_width_tiles, num_height_tiles])
            has_tiles = num_width_tiles > 1 or num_height_tiles > 1

            if has_tiles:
                images_crop_list.extend(
                    self.image_transform(cropped_image)
                    for cropped_image in images_crop_raw
                )

            # Global-view tokens plus one separator token.
            tokenized_image = [self.image_token_id] * num_tokens_base
            tokenized_image += [self.image_token_id]
            if has_tiles:
                # Local-view tokens, row by row ("v1" adds one per row).
                num_tokens_per_row = (
                    num_queries * num_width_tiles + 1
                    if self.strategy == "v1"
                    else num_queries * num_width_tiles
                )
                local_row = [self.image_token_id] * num_tokens_per_row
                tokenized_image += local_row * (num_queries * num_height_tiles)

            tokenized_str += tokenized_image
            images_seq_mask += [True] * len(tokenized_image)
            num_image_tokens.append(len(tokenized_image))

        # Process the last text split.
        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
        tokenized_str += tokenized_sep
        images_seq_mask += [False] * len(tokenized_sep)

        # Add the bos and eos tokens.
        if bos:
            tokenized_str = [self.bos_id] + tokenized_str
            images_seq_mask = [False] + images_seq_mask
        if eos:
            tokenized_str = tokenized_str + [self.eos_id]
            images_seq_mask = images_seq_mask + [False]

        assert len(tokenized_str) == len(images_seq_mask), (
            f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} "
            f"is not equal to images_seq_mask's length {len(images_seq_mask)}."
        )

        input_ids = torch.LongTensor(tokenized_str)
        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)

        # Negative ids are replaced by the pad id.
        input_ids[input_ids < 0] = self.pad_id

        if eos:
            # Remove the ending eos token. This is only valid when one was
            # appended above; previously it also ran (and asserted) for
            # eos=False.
            assert input_ids[-1] == self.eos_id
            input_ids = input_ids[:-1]
            images_seq_mask = images_seq_mask[:-1]

        if len(images_list) == 0:
            # No images: return empty tensors with the expected trailing dims.
            pixel_values = torch.zeros((0, 3, self.base_size, self.base_size))
            images_spatial_crop = torch.zeros((0, 2), dtype=torch.long)
            images_crop = torch.zeros((0, 3, self.image_size, self.image_size))
        else:
            pixel_values = torch.stack(images_list, dim=0)
            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
            if images_crop_list:
                images_crop = torch.stack(images_crop_list, dim=0)
            else:
                images_crop = torch.zeros((0, 3, self.image_size, self.image_size))

        input_ids = input_ids.unsqueeze(0)

        return (
            input_ids,
            pixel_values,
            images_crop,
            images_seq_mask,
            images_spatial_crop,
            num_image_tokens,
            image_shapes,
        )


AutoProcessor.register("DeepseekOCRProcessor", DeepseekOCRProcessor)

View File

@@ -0,0 +1,406 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
# coding=utf-8
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
# Copyright (c) 2023-2024 DeepSeek.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import math
from typing import Any
import torch
import torchvision.transforms as T
from PIL import Image, ImageOps
from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
from transformers.processing_utils import ProcessorMixin
class ImageTransform:
    """PIL-image-to-tensor conversion with optional channel normalization."""

    def __init__(
        self,
        mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
        std: tuple[float, float, float] = (0.5, 0.5, 0.5),
        normalize: bool = True,
    ):
        """Store the settings and assemble the torchvision pipeline.

        Args:
            mean: Per-channel mean given to ``T.Normalize``.
            std: Per-channel std given to ``T.Normalize``.
            normalize: Append ``T.Normalize`` after ``T.ToTensor`` when true.
        """
        self.mean = mean
        self.std = std
        self.normalize = normalize

        pipeline = [T.ToTensor(), T.Normalize(mean, std)] if normalize else [T.ToTensor()]
        self.transform = T.Compose(pipeline)

    def __call__(self, pil_img: Image.Image):
        """Run the pipeline on ``pil_img`` and return its output."""
        return self.transform(pil_img)
class DeepseekVLV2Processor(ProcessorMixin):
    """Processor that converts a prompt with ``<image>`` tags plus PIL images
    into token ids, stacked image views and an image-position mask.

    Every image contributes one global view followed by the tiles of its
    local view, all ``image_size`` x ``image_size``.
    """

    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
    attributes = ["tokenizer"]

    def __init__(
        self,
        tokenizer: LlamaTokenizerFast,
        candidate_resolutions: tuple[tuple[int, int]],
        patch_size: int,
        downsample_ratio: int,
        image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
        image_std: tuple[float, float, float] = (0.5, 0.5, 0.5),
        normalize: bool = True,
        image_token: str = "<image>",
        pad_token: str = "<▁pad▁>",
        add_special_token: bool = False,
        sft_format: str = "deepseek",
        mask_prompt: bool = True,
        ignore_id: int = -100,
        **kwargs,
    ):
        """
        Args:
            tokenizer: Tokenizer to wrap. It is modified in place: padding
                side set to "left", and pad/image/grounding/chat special
                tokens added.
            candidate_resolutions: ``(width, height)`` options for the local
                view; the first entry's width defines the tile size.
            patch_size: Patch size used when counting image tokens.
            downsample_ratio: Downsampling factor used when counting image
                tokens.
            image_mean: Per-channel mean; also the padding color (x255).
            image_std: Per-channel std.
            normalize: Whether images are normalized.
            image_token: Placeholder string marking image positions.
            pad_token: Pad token added when the tokenizer has none.
            add_special_token, sft_format, mask_prompt: Stored as attributes.
            ignore_id: Label id written at image/negative positions of the
                target ids.
            **kwargs: Forwarded to ``ProcessorMixin.__init__``.
        """
        self.candidate_resolutions = candidate_resolutions
        self.image_size = candidate_resolutions[0][0]
        self.patch_size = patch_size
        self.image_mean = image_mean
        self.image_std = image_std
        self.normalize = normalize
        self.downsample_ratio = downsample_ratio

        self.image_transform = ImageTransform(
            mean=image_mean, std=image_std, normalize=normalize
        )

        self.tokenizer = tokenizer
        # Left padding must be set explicitly; the padding side makes a
        # difference in batch inference.
        self.tokenizer.padding_side = "left"

        # Register the pad token so that 'tokenizer.pad_token' and
        # 'tokenizer.pad_token_id' are usable.
        if tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({"pad_token": pad_token})

        # Register the image token if the vocab lacks it, then record its id.
        if self.tokenizer.vocab.get(image_token) is None:
            self.tokenizer.add_special_tokens(
                {"additional_special_tokens": [image_token]}
            )
        self.image_token_id = self.tokenizer.vocab.get(image_token)

        # Two further groups of special tokens, added in this order:
        # grounding-related tags, then chat-role tags for SFT data.
        grounding_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
        sft_tokens = ["<|User|>", "<|Assistant|>"]
        for token_group in (grounding_tokens, sft_tokens):
            self.tokenizer.add_special_tokens(
                {"additional_special_tokens": token_group}
            )

        self.image_token = image_token
        self.pad_token = pad_token
        self.add_special_token = add_special_token
        self.sft_format = sft_format
        self.mask_prompt = mask_prompt
        self.ignore_id = ignore_id

        super().__init__(
            tokenizer,
            **kwargs,
        )

    def select_best_resolution(self, image_size):
        """Choose the candidate resolution used for cropping.

        The winner maximizes the effective (non-upscaled) pixel count after
        an aspect-preserving fit; ties go to the candidate wasting the
        fewest pixels. Returns ``None`` when no candidate has a positive
        effective resolution.
        """
        original_width, original_height = image_size
        chosen = None
        top_effective = 0
        least_wasted = float("inf")

        for width, height in self.candidate_resolutions:
            scale = min(width / original_width, height / original_height)
            fitted_w = int(original_width * scale)
            fitted_h = int(original_height * scale)
            effective = min(fitted_w * fitted_h, original_width * original_height)
            wasted = (width * height) - effective

            strictly_better = effective > top_effective
            tie_with_less_waste = effective == top_effective and wasted < least_wasted
            if strictly_better or tie_with_less_waste:
                top_effective = effective
                least_wasted = wasted
                chosen = (width, height)

        return chosen

    @property
    def bos_id(self):
        """Beginning-of-sequence token id of the wrapped tokenizer."""
        return self.tokenizer.bos_token_id

    @property
    def eos_id(self):
        """End-of-sequence token id of the wrapped tokenizer."""
        return self.tokenizer.eos_token_id

    @property
    def pad_id(self):
        """Padding token id of the wrapped tokenizer."""
        return self.tokenizer.pad_token_id

    def encode(self, text: str, bos: bool = True, eos: bool = False):
        """Tokenize ``text`` without automatic special tokens, then
        optionally wrap the ids with BOS / EOS."""
        ids = self.tokenizer.encode(text, add_special_tokens=False)
        if bos:
            ids = [self.bos_id] + ids
        if eos:
            ids = ids + [self.eos_id]
        return ids

    def decode(self, t: list[int], **kwargs) -> str:
        """Decode token ids with the wrapped tokenizer."""
        return self.tokenizer.decode(t, **kwargs)

    def process_one(
        self,
        prompt: str,
        images: list[Image.Image],
        inference_mode: bool = True,
        **kwargs: Any,
    ):
        """Process a single prompt together with its images.

        Args:
            prompt (str): the formatted prompt;
            images (list[Image.Image]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
            **kwargs: Additional keyword arguments (unused here).

        Returns:
            BatchFeature (tensor_type="pt") with the keys ``input_ids``,
            ``pixel_values``, ``images_seq_mask``, ``images_spatial_crop``
            and ``num_image_tokens``.
        """
        assert prompt is not None and images is not None, (
            "prompt and images must be used at the same time."
        )

        # Cropping into local tiles is only enabled for up to two images.
        (
            tokenized_str,
            images_list,
            images_seq_mask,
            images_spatial_crop,
            num_image_tokens,
        ) = self.tokenize_with_images(
            prompt, images, bos=True, eos=True, cropping=len(images) <= 2
        )

        # Copy of the ids in which image placeholders become ignore_id.
        masked_tokenized_str = [
            token if token != self.image_token_id else self.ignore_id
            for token in tokenized_str
        ]

        assert (
            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
        ), (
            f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
            f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
        )

        input_ids = torch.LongTensor(tokenized_str)
        target_ids = torch.LongTensor(masked_tokenized_str)
        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)

        # Positions with a negative id or the image token are ignored in the
        # targets; negative input ids are replaced by the pad id.
        ignore_positions = (input_ids < 0) | (input_ids == self.image_token_id)
        target_ids[ignore_positions] = self.ignore_id
        input_ids[input_ids < 0] = self.pad_id

        if inference_mode:
            # Drop the trailing eos token.
            assert input_ids[-1] == self.eos_id
            input_ids = input_ids[:-1]
            target_ids = target_ids[:-1]
            images_seq_mask = images_seq_mask[:-1]

        if len(images_list) != 0:
            pixel_values = torch.stack(images_list, dim=0)
            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
        else:
            # No images: a single all-zero placeholder view and crop entry.
            pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)

        input_ids = input_ids.unsqueeze(0)

        return BatchFeature(
            data=dict(
                input_ids=input_ids,
                pixel_values=pixel_values,
                images_seq_mask=images_seq_mask,
                images_spatial_crop=images_spatial_crop,
                num_image_tokens=num_image_tokens,
            ),
            tensor_type="pt",
        )

    def __call__(
        self,
        *,
        text: str,
        images: list[Image.Image],
        inference_mode: bool = True,
        **kwargs: Any,
    ):
        """Keyword-only entry point; delegates to :meth:`process_one`.

        Args:
            text (str): the formatted prompt;
            images (list[Image.Image]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
            **kwargs: Accepted and ignored.

        Returns:
            The ``BatchFeature`` produced by :meth:`process_one`.
        """
        return self.process_one(
            prompt=text,
            images=images,
            inference_mode=inference_mode,
        )

    def tokenize_with_images(
        self,
        conversation: str,
        images: list[Image.Image],
        bos: bool = True,
        eos: bool = True,
        cropping: bool = True,
    ):
        """Tokenize text with <image> tags.

        Returns:
            Tuple ``(tokenized_str, images_list, images_seq_mask,
            images_spatial_crop, num_image_tokens)``, all plain Python lists
            (``images_list`` holds the transformed image views).
        """
        assert conversation.count(self.image_token) == len(images)
        text_splits = conversation.split(self.image_token)

        images_list = []
        images_seq_mask = []
        images_spatial_crop = []
        num_image_tokens = []
        tokenized_str = []
        tile = self.image_size

        for text_sep, image in zip(text_splits, images):
            # Text preceding this image.
            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
            tokenized_str += tokenized_sep
            images_seq_mask += [False] * len(tokenized_sep)

            # Resolution of the local view (single tile when not cropping).
            if cropping:
                best_width, best_height = self.select_best_resolution(image.size)
            else:
                best_width, best_height = tile, tile

            pad_color = tuple(int(x * 255) for x in self.image_transform.mean)

            # Global view: whole image padded to one tile.
            global_view = ImageOps.pad(image, (tile, tile), color=pad_color)
            images_list.append(self.image_transform(global_view))

            # Local view: padded to the chosen resolution, cut row-major.
            local_view = ImageOps.pad(
                image,
                (best_width, best_height),
                color=tuple(int(x * 255) for x in self.image_transform.mean),
            )
            for top in range(0, best_height, tile):
                for left in range(0, best_width, tile):
                    tile_img = local_view.crop((left, top, left + tile, top + tile))
                    images_list.append(self.image_transform(tile_img))

            # Record the tile grid as [columns, rows].
            num_width_tiles = best_width // tile
            num_height_tiles = best_height // tile
            images_spatial_crop.append([num_width_tiles, num_height_tiles])

            # Image placeholder tokens.
            h = w = math.ceil((tile // self.patch_size) / self.downsample_ratio)
            # Global view: h rows of (w + 1) tokens; the +1 is a line separator.
            tokenized_image = [self.image_token_id] * h * (w + 1)
            # One separator between the global and local views.
            tokenized_image += [self.image_token_id]
            # Local views: (num_height_tiles * h) rows of
            # (num_width_tiles * w + 1) tokens.
            tokenized_image += (
                [self.image_token_id]
                * (num_height_tiles * h)
                * (num_width_tiles * w + 1)
            )

            tokenized_str += tokenized_image
            images_seq_mask += [True] * len(tokenized_image)
            num_image_tokens.append(len(tokenized_image))

        # Text after the last image.
        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
        tokenized_str += tokenized_sep
        images_seq_mask += [False] * len(tokenized_sep)

        # Optional bos / eos wrapping.
        if bos:
            tokenized_str = [self.bos_id] + tokenized_str
            images_seq_mask = [False] + images_seq_mask
        if eos:
            tokenized_str = tokenized_str + [self.eos_id]
            images_seq_mask = images_seq_mask + [False]

        assert len(tokenized_str) == len(images_seq_mask), (
            f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
        )

        return (
            tokenized_str,
            images_list,
            images_seq_mask,
            images_spatial_crop,
            num_image_tokens,
        )


AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)

View File

@@ -0,0 +1,504 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import torch
import torch.nn as nn
import torchaudio.compliance.kaldi as kaldi
from torch.nn.utils.rnn import pad_sequence
from transformers import (
AutoFeatureExtractor,
AutoProcessor,
BatchFeature,
)
from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
from transformers.processing_utils import ProcessorMixin
from transformers.utils import TensorType
from vllm.logger import init_logger
logger = init_logger(__name__)
def apply_cmvn(inputs, cmvn):  # noqa
    """
    Apply CMVN with mvn data.

    Computes ``(inputs + cmvn[0]) * cmvn[1]`` using the first ``dim`` columns
    of ``cmvn``, where ``dim`` is the second dimension of ``inputs``.

    Args:
        inputs: 2-D tensor (unpacked as ``frame, dim``).
        cmvn: Tensor whose row 0 holds the additive shift and row 1 the
            multiplicative scale.

    Returns:
        A float32 tensor. ``inputs`` itself is left unmodified; the earlier
        in-place ``+=`` / ``*=`` silently altered the caller's tensor (and
        any tensor it was a view of).
    """
    _, dim = inputs.shape  # also enforces a 2-D input
    device = inputs.device
    shift = cmvn[0:1, :dim].to(device)
    scale = cmvn[1:2, :dim].to(device)
    normalized = (inputs + shift) * scale
    return normalized.type(torch.float32)
def apply_lfr(inputs, lfr_m, lfr_n):
    """Stack ``lfr_m`` consecutive frames with a hop of ``lfr_n`` frames.

    The first frame is repeated ``(lfr_m - 1) // 2`` times at the front and
    the last frame is repeated at the end as needed, so that the output has
    ``ceil(T / lfr_n)`` rows of ``lfr_m * feat_dim`` values each.

    Args:
        inputs: 2-D tensor of frames, one row per frame.
        lfr_m: Number of frames concatenated into each output row.
        lfr_n: Hop (in frames) between successive output rows.

    Returns:
        A new float32 tensor of shape ``(ceil(T / lfr_n), lfr_m * feat_dim)``.
    """
    num_frames = inputs.shape[0]
    out_frames = int(np.ceil(num_frames / lfr_n))

    # Left context: repeat the first frame.
    half_context = (lfr_m - 1) // 2
    head = inputs[0].repeat(half_context, 1)
    inputs = torch.vstack((head, inputs))
    num_frames = num_frames + half_context

    feat_dim = inputs.shape[-1]
    out_shape = (out_frames, lfr_m * feat_dim)
    out_strides = (lfr_n * feat_dim, 1)

    # Right context: repeat the last frame if the final windows run short.
    last_idx = (num_frames - lfr_m) // lfr_n + 1
    shortfall = lfr_m - (num_frames - last_idx * lfr_n)
    if shortfall > 0:
        shortfall = (
            (2 * lfr_m - 2 * num_frames + (out_frames - 1 + last_idx) * lfr_n)
            / 2
            * (out_frames - last_idx)
        )
        tail = [inputs[-1:]] * int(shortfall)
        inputs = torch.vstack([inputs] + tail)

    # Overlapping windows are read directly from the padded buffer, then
    # cloned so the result owns its memory.
    windows = inputs.as_strided(out_shape, out_strides)
    return windows.clone().type(torch.float32)
def load_cmvn(cmvn_file):
    """Load CMVN shift/scale vectors from a text file.

    The file is scanned for a line starting with ``<AddShift>`` and one
    starting with ``<Rescale>``. When the line that follows such a tag starts
    with ``<LearnRateCoef>``, its whitespace-separated fields from index 3 up
    to (but excluding) the last field are taken as the values.

    Blank lines and a tag on the final line are skipped instead of raising
    ``IndexError``.

    Args:
        cmvn_file: Path of the CMVN file (read as UTF-8).

    Returns:
        A float32 tensor with two rows: row 0 from ``<AddShift>``, row 1 from
        ``<Rescale>``.
    """
    with open(cmvn_file, encoding="utf-8") as f:
        lines = f.readlines()

    means_list = []
    vars_list = []
    for idx, line in enumerate(lines):
        fields = line.split()
        if not fields or fields[0] not in ("<AddShift>", "<Rescale>"):
            continue
        if idx + 1 >= len(lines):
            # Tag on the last line: there is no value line to read.
            continue
        value_fields = lines[idx + 1].split()
        if not value_fields or value_fields[0] != "<LearnRateCoef>":
            continue
        # Drop the three leading fields and the trailing closing field.
        values = value_fields[3:-1]
        if fields[0] == "<AddShift>":
            means_list = values
        else:
            vars_list = values

    means = np.array(means_list).astype(np.float32)
    scales = np.array(vars_list).astype(np.float32)
    cmvn = np.array([means, scales])
    return torch.as_tensor(cmvn, dtype=torch.float32)
class WavFrontend(nn.Module):
    """Conventional frontend structure for ASR.

    Computes Kaldi-style filter-bank features per utterance, optionally
    followed by low-frame-rate stacking (``apply_lfr``) and CMVN
    (``apply_cmvn``), and pads the per-utterance results into one batch.
    """

    def __init__(
        self,
        cmvn_file: str = "null",
        fs: int = 16000,
        window: str = "hamming",
        n_mels: int = 80,
        frame_length: int = 25,
        frame_shift: int = 10,
        filter_length_min: int = -1,
        filter_length_max: int = -1,
        lfr_m: int = 1,
        lfr_n: int = 1,
        dither: float = 1.0,
        snip_edges: bool = True,
        upsacle_samples: bool = True,
        **kwargs,
    ):
        """
        Args:
            cmvn_file: Path of the CMVN file. ``None`` or the default
                sentinel ``"null"`` disables CMVN.
            fs: Sample frequency passed to ``kaldi.fbank``.
            window: Window type passed to ``kaldi.fbank``.
            n_mels: Number of mel bins.
            frame_length: Frame length passed to ``kaldi.fbank``.
            frame_shift: Frame shift passed to ``kaldi.fbank``.
            filter_length_min, filter_length_max: Stored only.
            lfr_m: Frames stacked per output row by ``apply_lfr``.
            lfr_n: Hop between stacked rows in ``apply_lfr``.
            dither: Dither constant passed to ``kaldi.fbank``.
            snip_edges: Passed to ``kaldi.fbank`` in :meth:`forward`.
            upsacle_samples: When true, :meth:`forward` multiplies the
                waveform by ``1 << 15`` first. (The spelling is part of the
                public interface and is kept.)
            **kwargs: Accepted and ignored.
        """
        super().__init__()
        self.fs = fs
        self.window = window
        self.n_mels = n_mels
        self.frame_length = frame_length
        self.frame_shift = frame_shift
        self.filter_length_min = filter_length_min
        self.filter_length_max = filter_length_max
        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.cmvn_file = cmvn_file
        self.dither = dither
        self.snip_edges = snip_edges
        self.upsacle_samples = upsacle_samples
        # The default "null" used to be handed to load_cmvn, which then tried
        # to open a file literally named "null". Treat it like None.
        cmvn_disabled = cmvn_file is None or cmvn_file == "null"
        self.cmvn = None if cmvn_disabled else load_cmvn(cmvn_file)

    def output_size(self) -> int:
        """Feature dimension after LFR stacking: ``n_mels * lfr_m``."""
        return self.n_mels * self.lfr_m

    def forward(
        self,
        input: torch.Tensor,
        input_lengths,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Waveforms -> fbank -> optional LFR -> optional CMVN.

        Args:
            input: Batch of waveforms, indexed as ``input[i][:length]``.
            input_lengths: Valid sample count of each waveform.
            **kwargs: Accepted and ignored.

        Returns:
            ``(feats_pad, feats_lens)``: zero-padded batch-first features and
            the number of feature frames per utterance.
        """
        batch_size = input.size(0)
        feats = []
        feats_lens = []
        for i in range(batch_size):
            waveform_length = input_lengths[i]
            waveform = input[i][:waveform_length]
            if self.upsacle_samples:
                # Presumably maps [-1, 1] floats to the 16-bit PCM range
                # expected by Kaldi-style fbank -- confirm upstream.
                waveform = waveform * (1 << 15)
            waveform = waveform.unsqueeze(0)
            mat = kaldi.fbank(
                waveform,
                num_mel_bins=self.n_mels,
                # Never ask for a frame longer than the clip itself
                # (clip duration expressed in milliseconds).
                frame_length=min(self.frame_length, waveform_length / self.fs * 1000),
                frame_shift=self.frame_shift,
                dither=self.dither,
                energy_floor=0.0,
                window_type=self.window,
                sample_frequency=self.fs,
                snip_edges=self.snip_edges,
            )

            if self.lfr_m != 1 or self.lfr_n != 1:
                mat = apply_lfr(mat, self.lfr_m, self.lfr_n)
            if self.cmvn is not None:
                mat = apply_cmvn(mat, self.cmvn)
            feats.append(mat)
            feats_lens.append(mat.size(0))

        feats_lens = torch.as_tensor(feats_lens)
        if batch_size == 1:
            # Single utterance: just add the batch dimension.
            feats_pad = feats[0][None, :, :]
        else:
            feats_pad = pad_sequence(feats, batch_first=True, padding_value=0.0)
        return feats_pad, feats_lens

    def forward_fbank(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Waveforms -> fbank only (no LFR, no CMVN).

        Unlike :meth:`forward`, the waveform is always scaled by ``1 << 15``,
        ``frame_length`` is used as configured and ``snip_edges`` is left at
        the ``kaldi.fbank`` default.

        Returns:
            ``(feats_pad, feats_lens)`` as in :meth:`forward`.
        """
        batch_size = input.size(0)
        feats = []
        feats_lens = []
        for i in range(batch_size):
            waveform_length = input_lengths[i]
            waveform = input[i][:waveform_length]
            waveform = waveform * (1 << 15)
            waveform = waveform.unsqueeze(0)
            mat = kaldi.fbank(
                waveform,
                num_mel_bins=self.n_mels,
                frame_length=self.frame_length,
                frame_shift=self.frame_shift,
                dither=self.dither,
                energy_floor=0.0,
                window_type=self.window,
                sample_frequency=self.fs,
            )
            feats.append(mat)
            feats_lens.append(mat.size(0))

        feats_lens = torch.as_tensor(feats_lens)
        feats_pad = pad_sequence(feats, batch_first=True, padding_value=0.0)
        return feats_pad, feats_lens

    def forward_lfr_cmvn(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Precomputed features -> optional LFR -> optional CMVN.

        Args:
            input: Batch of feature matrices, sliced as
                ``input[i, :input_lengths[i], :]``.
            input_lengths: Valid frame count of each item.

        Returns:
            ``(feats_pad, feats_lens)`` as in :meth:`forward`.
        """
        batch_size = input.size(0)
        feats = []
        feats_lens = []
        for i in range(batch_size):
            mat = input[i, : input_lengths[i], :]
            if self.lfr_m != 1 or self.lfr_n != 1:
                mat = apply_lfr(mat, self.lfr_m, self.lfr_n)
            if self.cmvn is not None:
                mat = apply_cmvn(mat, self.cmvn)
            feats.append(mat)
            feats_lens.append(mat.size(0))

        feats_lens = torch.as_tensor(feats_lens)
        feats_pad = pad_sequence(feats, batch_first=True, padding_value=0.0)
        return feats_pad, feats_lens
class FunASRFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a FunASR feature extractor.

    This feature extractor inherits from
    [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`], which
    provides the padding/truncation machinery used here.

    Raw speech is padded (or truncated) with `self.pad` and then converted
    to features by a `WavFrontend` built from `frontend_conf`.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized
            expressed in hertz (Hz).
        hop_length (`int`, *optional*, defaults to 160):
            Only used to derive `nb_max_frames = n_samples // hop_length`.
        chunk_length (`int`, *optional*, defaults to 30):
            `n_samples = chunk_length * sampling_rate` is the default
            `max_length` used when padding/truncating.
        n_fft (`int`, *optional*, defaults to 400):
            Stored on the instance; not read elsewhere in this class.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio.
        dither (`float`, *optional*, defaults to 0.0):
            Forwarded to `WavFrontend` as its `dither` argument.
        return_attention_mask (`bool`, *optional*, defaults to `False`):
            Forwarded to the base class.
        frontend_conf (`dict`, *optional*, passed through `**kwargs`):
            Keyword arguments for `WavFrontend`. It must not contain
            `dither`, which is always supplied from `self.dither`.
    """

    model_input_names = ["input_features"]

    def __init__(
        self,
        feature_size=80,
        sampling_rate=16000,
        hop_length=160,
        chunk_length=30,
        n_fft=400,
        padding_value=0.0,
        dither=0.0,
        return_attention_mask=False,
        **kwargs,
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            return_attention_mask=return_attention_mask,
            **kwargs,
        )
        self.frontend_conf = kwargs.get("frontend_conf", {})
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.chunk_length = chunk_length
        self.n_samples = chunk_length * sampling_rate
        self.nb_max_frames = self.n_samples // hop_length
        self.sampling_rate = sampling_rate
        self.dither = dither

    def extract_fbank(
        self, data, data_len=None, data_type: str = "sound", frontend=None, **kwargs
    ):
        """Normalise `data` to a batched tensor and run `frontend` on it.

        Args:
            data: A numpy array, a torch tensor, or a list/tuple of either.
                1-D arrays/tensors get a leading batch axis.
            data_len: Per-row valid lengths. When `None` for an array/tensor
                input it defaults to `[data.shape[1]]`, i.e. a single entry.
                For list/tuple input it is recomputed from the items.
            data_type: Accepted but not used here.
            frontend: Callable invoked as `frontend(data, data_len, **kwargs)`
                returning `(features, lengths)`.
            **kwargs: Forwarded to `frontend`.

        Returns:
            `(features, lengths)` as `float32` and `int32` tensors.
        """
        if isinstance(data, np.ndarray):
            data = torch.from_numpy(data)
            if len(data.shape) < 2:
                data = data[None, :]  # data: [batch, N]
            data_len = [data.shape[1]] if data_len is None else data_len
        elif isinstance(data, torch.Tensor):
            if len(data.shape) < 2:
                data = data[None, :]  # data: [batch, N]
            data_len = [data.shape[1]] if data_len is None else data_len
        elif isinstance(data, (list, tuple)):
            data_list, data_len = [], []
            for data_i in data:
                if isinstance(data_i, np.ndarray):
                    data_i = torch.from_numpy(data_i)
                data_list.append(data_i)
                data_len.append(data_i.shape[0])
            data = pad_sequence(data_list, batch_first=True)

        data, data_len = frontend(data, data_len, **kwargs)

        if isinstance(data_len, (list, tuple)):
            data_len = torch.tensor([data_len])
        return data.to(torch.float32), data_len.to(torch.int32)

    def __call__(
        self,
        raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
        truncation: bool = True,
        pad_to_multiple_of: int | None = None,
        return_tensors: str | TensorType | None = None,
        return_attention_mask: bool | None = None,
        padding: str | None = "max_length",
        max_length: int | None = None,
        sampling_rate: int | None = None,
        do_normalize: bool | None = None,
        device: str | None = "cpu",
        return_token_timestamps: bool | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Pad raw speech and convert it to frontend features.

        Args:
            raw_speech: One utterance (array or list of floats) or a list of
                utterances.
            truncation: Forwarded to `self.pad`.
            pad_to_multiple_of: Forwarded to `self.pad`.
            return_tensors: If set, the padded inputs are converted with
                `BatchFeature.convert_to_tensors`.
            return_attention_mask: Forwarded to `self.pad` (OR-ed with
                `do_normalize`).
            padding: Forwarded to `self.pad`.
            max_length: Forwarded to `self.pad`; falls back to
                `self.n_samples` when falsy.
            sampling_rate: Accepted but not used here.
            do_normalize: Only used to request an attention mask.
            device: Accepted but not used here.
            return_token_timestamps: Accepted but not used here.
            **kwargs: Only `data_type` is read.

        Returns:
            A `BatchFeature` whose `input_features` are the frontend
            features, plus `speech_lengths` and `fake_token_len` tensors.
        """
        is_batched = isinstance(raw_speech, (list, tuple)) and (
            isinstance(raw_speech[0], (np.ndarray, tuple, list))
        )

        if is_batched:
            raw_speech = [
                np.asarray([speech], dtype=np.float32).T for speech in raw_speech
            ]
        elif not isinstance(raw_speech, np.ndarray):
            raw_speech = np.asarray(raw_speech, dtype=np.float32)
        elif raw_speech.dtype == np.float64:
            # Compare dtypes by equality rather than identity.
            raw_speech = raw_speech.astype(np.float32)

        # Always work on a batch, even for a single utterance.
        if not is_batched:
            raw_speech = [np.asarray([raw_speech]).T]

        batched_speech = BatchFeature({"input_features": raw_speech})

        padded_inputs = self.pad(
            batched_speech,
            padding=padding,
            max_length=max_length if max_length else self.n_samples,
            truncation=truncation,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask or do_normalize,
        )

        # `pad` may hand back a list of per-utterance arrays; np.asarray
        # stacks them into one array (and is a no-op for an ndarray) so that
        # `.transpose` is always available.
        input_features = np.asarray(padded_inputs.get("input_features")).transpose(
            2, 0, 1
        )

        frontend = WavFrontend(**self.frontend_conf, dither=self.dither)
        input_features, speech_lengths = self.extract_fbank(
            input_features[0],
            data_type=kwargs.get("data_type", "sound"),
            frontend=frontend,
            is_final=True,
        )

        # NOTE(review): these three steps presumably mirror the length
        # reduction performed by the audio encoder -- confirm against the model.
        olens = 1 + (speech_lengths - 3 + 2 * 1) // 2
        olens = 1 + (olens - 3 + 2 * 1) // 2
        fake_token_len = (olens - 1) // 2 + 1

        if isinstance(input_features[0], list):
            padded_inputs["input_features"] = [
                np.asarray(feature, dtype=np.float32) for feature in input_features
            ]
        else:
            padded_inputs["input_features"] = input_features

        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        # Added after the conversion above, so these stay the torch tensors
        # produced by `extract_fbank`.
        padded_inputs["speech_lengths"] = speech_lengths
        padded_inputs["fake_token_len"] = fake_token_len

        return padded_inputs
class FunASRProcessor(ProcessorMixin):
    r"""
    Constructs a FunASR processor which wraps a FunASR feature extractor and
    a FunASR tokenizer into a single processor.

    [`FunASRProcessor`] offers all the functionalities of
    [`FunASRFeatureExtractor`] and [`Qwen2Tokenizer`]. See the
    [`~FunASRProcessor.__call__`] and [`~FunASRProcessor.decode`] for more
    information.

    Args:
        feature_extractor (`FunASRFeatureExtractor`): An instance of
            [`FunASRFeatureExtractor`].
            The feature extractor is a required input.
        tokenizer (`Qwen2Tokenizer`):
            An instance of [`Qwen2Tokenizer`]. The tokenizer is a required
            input.
        audio_token (`str`, *optional*, defaults to `"<|AUDIO|>"`):
            Audio placeholder token, used only when the tokenizer has no
            `audio_token` attribute of its own.
    """

    feature_extractor_class = "FunASRFeatureExtractor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(
        self,
        feature_extractor,
        tokenizer,
        audio_token="<|AUDIO|>",
    ):
        super().__init__(feature_extractor, tokenizer)
        self.current_processor = self.feature_extractor
        self._in_target_context_manager = False
        # Prefer the tokenizer's own audio token when it defines one.
        self.audio_token = (
            tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token
        )
        self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)

    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
        """Delegate to `self.tokenizer.get_decoder_prompt_ids`."""
        return self.tokenizer.get_decoder_prompt_ids(
            task=task, language=language, no_timestamps=no_timestamps
        )

    def __call__(self, *args, **kwargs):
        """
        Forwards the `audio` argument to FunASRFeatureExtractor's
        [`~FunASRFeatureExtractor.__call__`] and the `text` argument to
        [`~Qwen2Tokenizer.__call__`]. Please refer to the docstring of the
        above two methods for more information.

        `audio` may be given as the first positional argument or as a
        keyword. `text` is required and must be a string or a list (or
        tuple) of strings. Remaining keyword arguments are passed to both
        the feature extractor and the tokenizer.

        Returns:
            The tokenizer encodings when `audio` is `None`; otherwise the
            feature-extractor output with the tokenized `input_ids` stored
            under `"labels"`.

        Raises:
            ValueError: If `text` is missing or not a string / sequence of
                strings, or if the number of audio tokens in `text` differs
                from the number of audios.
        """
        if self._in_target_context_manager:
            return self.current_processor(*args, **kwargs)

        audio = kwargs.pop("audio", None)
        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            audio = args[0]
            args = args[1:]

        if text is None:
            raise ValueError("You need to specify `text` input to process.")
        if isinstance(text, str):
            text = [text]
        elif not (
            isinstance(text, (list, tuple))
            and all(isinstance(sample, str) for sample in text)
        ):
            # Reject anything that is not a sequence made only of strings.
            raise ValueError(
                "Invalid input text. Please provide a string, or a list of strings"
            )

        if audio is None:
            return self.tokenizer(text, **kwargs)

        # ensure we have as much audios as audio tokens
        num_audio_tokens = sum(sample.count(self.audio_token) for sample in text)
        num_audios = 1 if isinstance(audio, np.ndarray) else len(audio)
        if num_audio_tokens != num_audios:
            raise ValueError(
                f"Found {num_audio_tokens} {self.audio_token} "
                f"token{'s' if num_audio_tokens > 1 else ''} in provided text "
                f"but received {num_audios} audio{'s' if num_audios > 1 else ''}"
            )

        inputs = self.feature_extractor(
            audio, *args, sampling_rate=sampling_rate, **kwargs
        )

        # Every audio token expands to the same number of copies.
        # `.item()` requires `fake_token_len` to hold exactly one value.
        expanded_audio_token = self.audio_token * inputs["fake_token_len"].item()
        # One `str.replace` pass substitutes each occurrence exactly once and
        # never touches user text that happens to contain "<placeholder>".
        text = [
            sample.replace(self.audio_token, expanded_audio_token) for sample in text
        ]

        encodings = self.tokenizer(text, **kwargs)
        inputs["labels"] = encodings["input_ids"]
        return inputs

    def get_prompt_ids(self, text: str, return_tensors="np"):
        """Delegate to `self.tokenizer.get_prompt_ids`."""
        return self.tokenizer.get_prompt_ids(text, return_tensors=return_tensors)
# Register the FunASR feature extractor and processor with the corresponding
# auto classes, keyed by their class-name strings.
AutoFeatureExtractor.register("FunASRFeatureExtractor", FunASRFeatureExtractor)
AutoProcessor.register("FunASRProcessor", FunASRProcessor)

View File

@@ -0,0 +1,230 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/processing_hunyuan_vl.py
import numpy as np
import torch
from transformers import AutoProcessor
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.video_utils import VideoInput
class HunYuanVLProcessor(ProcessorMixin):
    """Processor pairing an image processor with a tokenizer for HunYuanVL.

    `__call__` expands every image token in the prompt to the number of
    tokens the image occupies, tokenizes the prompt and builds a 4-row
    `position_ids` tensor plus `attention_mask` and `imgs_pos`.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"  # ("AutoTokenizer", None)

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        **kwargs,
    ):
        """Resolve the special tokens from hard-coded ids, then init the mixin.

        `tokenizer` is effectively required: it is dereferenced immediately.
        Extra `**kwargs` are accepted and ignored.
        """
        # TODO Fix the init
        self.tokenizer = tokenizer
        # Special-token ids are hard-coded instead of read from the tokenizer.
        self.image_token_id = 120120  # self.tokenizer.image_token_id
        self.image_token = self.tokenizer.convert_ids_to_tokens(self.image_token_id)
        self.im_start_token_id = 120118  # self.tokenizer.im_start_id
        self.im_start_token = self.tokenizer.convert_ids_to_tokens(
            self.im_start_token_id
        )
        self.im_end_token_id = 120119  # self.tokenizer.im_end_id
        self.im_end_token = self.tokenizer.convert_ids_to_tokens(self.im_end_token_id)
        # The last vocabulary entry serves as a temporary stand-in while
        # image tokens are being expanded in `__call__`.
        self.placeholder_token = self.tokenizer.convert_ids_to_tokens(
            self.tokenizer.vocab_size - 1
        )
        self.pad_id = 120002  # self.tokenizer.pad_token_id

        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: ImageInput = None,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        videos: VideoInput = None,
        **kwargs,
    ) -> BatchFeature:
        """Tokenize `text` and preprocess `images` into model inputs.

        Args:
            images: Images handed to `self.image_processor`; `None` skips
                all image handling.
            text: A prompt or list of prompts.
            videos: Accepted but not used here.
            **kwargs: Forwarded to the tokenizer. `return_tensors` is also
                used as the `tensor_type` of the returned `BatchFeature`.
                NOTE(review): `input_ids` is used with `torch.where` and
                `.ne`, so the tokenizer presumably has to return torch
                tensors (`return_tensors="pt"`) -- confirm with callers.

        Returns:
            A `BatchFeature` combining the tokenizer output, `position_ids`,
            `attention_mask`, `imgs_pos` and the image-processor output.
        """
        image_inputs = {}
        if images is not None:
            image_inputs = self.image_processor(images=images)
            image_grid_thw = image_inputs["image_grid_thw"]

        if not isinstance(text, list):
            text = [text]

        text = text.copy()  # below lines change text in-place

        # image_tokens_cumsum[k] is the number of image tokens emitted
        # before the k-th image.
        image_tokens_cumsum = [0]
        if images is not None:
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    grid_h, grid_w = image_grid_thw[index][-2:]
                    patch_h = grid_h // self.image_processor.merge_size
                    patch_w = grid_w // self.image_processor.merge_size
                    num_image_tokens = patch_h * (patch_w + 1) + 2
                    image_tokens_cumsum.append(
                        image_tokens_cumsum[-1] + num_image_tokens
                    )
                    # Expand into placeholders first so the loop condition
                    # does not see the image tokens we are about to insert.
                    text[i] = text[i].replace(
                        self.image_token, self.placeholder_token * num_image_tokens, 1
                    )
                    index += 1
                text[i] = text[i].replace(self.placeholder_token, self.image_token)

        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])

        input_ids = text_inputs["input_ids"]
        # Position ids are built from the first sequence only. Four separate
        # tensors are needed because three of them are edited in place below.
        seq_len = len(input_ids[0])
        position_ids = torch.arange(seq_len)
        position_ids_w = torch.arange(seq_len)
        position_ids_h = torch.arange(seq_len)
        position_ids_t = torch.arange(seq_len)

        if images is not None:
            image_token_pos_indices = torch.where(input_ids[0] == self.image_token_id)[
                0
            ]
            for i in range(len(image_grid_thw)):
                grid_h, grid_w = image_grid_thw[i][-2:]
                patch_h = grid_h // self.image_processor.merge_size
                patch_w = grid_w // self.image_processor.merge_size
                # Skip the first image token of this image; the following
                # (patch_w + 1) * patch_h tokens get grid coordinates.
                start_pos = image_token_pos_indices[image_tokens_cumsum[i]].item() + 1
                replace_num = (patch_w + 1) * patch_h
                position_ids_w[start_pos : start_pos + replace_num] = torch.tensor(
                    list(range(patch_w + 1)) * patch_h, dtype=torch.int64
                )
                patch_h_list = []
                for h in range(patch_h):
                    patch_h_list += [h] * (patch_w + 1)
                position_ids_h[start_pos : start_pos + replace_num] = torch.tensor(
                    patch_h_list, dtype=torch.int64
                )
                position_ids_t[start_pos : start_pos + replace_num] = 0

        position_ids = torch.stack(
            [position_ids, position_ids_w, position_ids_h, position_ids_t]
        ).unsqueeze(0)
        text_inputs["position_ids"] = position_ids

        attention_mask = input_ids.ne(self.pad_id)
        text_inputs["attention_mask"] = attention_mask
        text_inputs["imgs_pos"] = [self.get_imgs_pos(e) for e in input_ids]

        return_tensors = kwargs.pop("return_tensors", None)
        return BatchFeature(
            data={**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )

    def batch_decode(self, *args, **kwargs):
        """Delegate to `self.tokenizer.batch_decode`."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Delegate to `self.tokenizer.decode`."""
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(
        self,
        generated_outputs,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        """Not supported by this processor.

        Raises:
            NotImplementedError: Always. An explicit raise is used because a
                bare `assert` is stripped under `python -O`, which would make
                this method silently return `None`.
        """
        raise NotImplementedError(
            "post_process_image_text_to_text is not implemented for "
            "HunYuanVLProcessor"
        )

    def apply_chat_template(self, *args, **kwargs):
        """Delegate to the tokenizer's chat template with `return_dict=False`."""
        kwargs["return_dict"] = False
        return self.tokenizer.apply_chat_template(*args, **kwargs)

    def get_imgs_pos(self, doc_ids):
        """Return `[begin + 1, end]` index pairs for each image span.

        `begin` / `end` are the positions of `im_start_token_id` /
        `im_end_token_id` in `doc_ids`; the two must occur equally often.
        """
        doc_ids = np.array(doc_ids, dtype=np.int64)
        img_begin_index = np.where(doc_ids == self.im_start_token_id)[0]
        img_end_index = np.where(doc_ids == self.im_end_token_id)[0]
        imgs_pos = np.concatenate(
            (
                np.reshape(img_begin_index + 1, (-1, 1)),
                np.reshape(img_end_index, (-1, 1)),
            ),
            axis=-1,
        ).tolist()
        return imgs_pos

    @property
    def model_input_names(self):
        """Tokenizer input names followed by image-processor ones, de-duplicated."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
def split_image_into_patch_blocks(
    pixel_values: torch.Tensor,  # shape: [batch_size, 3, H, W]
    patch_size: int = 16,  # e.g. 16
    adaptor_patch_div: int = 4,  # e.g. 4
) -> torch.Tensor:
    """
    Regroup a batch of images into patch-sized blocks.

    H and W are each factored as
    (number of patches) x (patch_size // adaptor_patch_div) x adaptor_patch_div.
    The axes are then permuted to
    [B, ph, pw, ps // aps, ps // aps, 3, aps, aps] and the result is flattened
    into blocks of shape [3, patch_size, patch_size].

    Args:
        pixel_values: Input image tensor of shape [batch_size, 3, H, W].
        patch_size: Size of the large patch, e.g., 16. H and W must be
            divisible by it.
        adaptor_patch_div: Inner factor used when splitting each patch
            dimension; `patch_size` must be divisible by it for the reshape
            to succeed.

    Returns:
        patches: A tensor of shape [N, 3, patch_size, patch_size], where
            N = batch_size * (H // patch_size) * (W // patch_size).
    """
    batch_size, channels, height, width = pixel_values.shape
    assert channels == 3, "Pixel values must have 3 channels in dim=1"
    assert height % patch_size == 0 and width % patch_size == 0, (
        "H and W must be divisible by patch_size"
    )

    rows = height // patch_size
    cols = width // patch_size
    inner = patch_size // adaptor_patch_div  # ps // aps

    # Factor H into (rows, inner, aps) and W into (cols, inner, aps) at once.
    blocks = pixel_values.reshape(
        batch_size,
        3,
        rows,
        inner,
        adaptor_patch_div,
        cols,
        inner,
        adaptor_patch_div,
    )
    # -> [B, rows, cols, inner, inner, 3, aps, aps]
    blocks = blocks.permute(0, 2, 5, 3, 6, 1, 4, 7)
    return blocks.reshape(-1, 3, patch_size, patch_size)
# Register the processor with AutoProcessor, keyed by its class-name string.
AutoProcessor.register("HunYuanVLProcessor", HunYuanVLProcessor)

View File

@@ -0,0 +1,477 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/image_processing_hunyuan_vl.py
"""Image processor class for HunYuanVL."""
# isort conflicts with ruff for transformers imports
# isort: skip_file
import math
import numpy as np
import torchvision.transforms as transforms
from transformers import AutoImageProcessor
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_transforms import (
convert_to_rgb,
)
from transformers.image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
make_flat_list_of_images,
make_list_of_images,
valid_images,
validate_preprocess_arguments,
)
from transformers.utils import TensorType, logging
from transformers.video_utils import VideoInput, make_batched_videos
logger = logging.get_logger(__name__)
def smart_resize(
    height: int,
    width: int,
    factor: int = 16,
    min_pixels: int = 512 * 512,
    max_pixels: int = 2048 * 2048,
):
    """Pick a target size for an image.

    The returned `(height, width)` satisfies:

    1. Both dimensions are multiples of `factor`.
    2. The pixel count is moved towards `[min_pixels, max_pixels]` when the
       rounded size falls outside that range.
    3. The aspect ratio is kept as close to the input's as rounding allows.

    Raises:
        ValueError: If the longer side is more than 200 times the shorter.
    """
    aspect = max(height, width) / min(height, width)
    if aspect > 200:
        raise ValueError(
            "absolute aspect ratio must be smaller than 200, got "
            f"{aspect}"
        )

    # Start from the nearest multiples of `factor`.
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    area = h_bar * w_bar

    if area > max_pixels:
        # Too large: shrink both sides by the same ratio, rounding down but
        # never below one `factor`.
        shrink = math.sqrt((height * width) / max_pixels)
        return (
            max(factor, math.floor(height / shrink / factor) * factor),
            max(factor, math.floor(width / shrink / factor) * factor),
        )

    if area < min_pixels:
        # Too small: grow both sides by the same ratio, rounding up.
        grow = math.sqrt(min_pixels / (height * width))
        return (
            math.ceil(height * grow / factor) * factor,
            math.ceil(width * grow / factor) * factor,
        )

    return h_bar, w_bar
class HunYuanVLImageProcessor(BaseImageProcessor):
model_input_names = [
"pixel_values",
"image_grid_thw",
"pixel_values_videos",
"video_grid_thw",
]
def __init__(
    self,
    do_resize: bool = True,
    size: dict[str, int] | None = None,
    resample: PILImageResampling = PILImageResampling.BICUBIC,
    do_rescale: bool = True,
    rescale_factor: int | float = 1 / 255,
    do_normalize: bool = True,
    image_mean: float | list[float] | None = None,
    image_std: float | list[float] | None = None,
    do_convert_rgb: bool = True,
    min_pixels: int | None = None,
    max_pixels: int | None = None,
    patch_size: int = 16,
    temporal_patch_size: int = 2,
    merge_size: int = 2,
    **kwargs,
) -> None:
    """Store the preprocessing configuration.

    Args:
        do_resize: Default for whether images are resized.
        size: Dict with `shortest_edge` and `longest_edge` keys, stored as
            `min_pixels` and `max_pixels` respectively. Defaults to
            `512 * 512` and `2048 * 2048` when `None`.
        resample: Stored resampling filter.
        do_rescale: Stored rescale flag.
        rescale_factor: Stored rescale factor.
        do_normalize: Default for whether images are normalized.
        image_mean: Normalization mean; defaults to `OPENAI_CLIP_MEAN`.
        image_std: Normalization std; defaults to `OPENAI_CLIP_STD`.
        do_convert_rgb: Default for whether images are converted to RGB.
        min_pixels: If given, overrides `size["shortest_edge"]`.
        max_pixels: If given, overrides `size["longest_edge"]`.
        patch_size: Spatial patch size.
        temporal_patch_size: Temporal patch size.
        merge_size: Merge size.
        **kwargs: Forwarded to the base class.

    Raises:
        ValueError: If `size` is given without both required keys.
    """
    super().__init__(**kwargs)
    if size is None:
        size = {"shortest_edge": 512 * 512, "longest_edge": 2048 * 2048}
    elif "shortest_edge" not in size or "longest_edge" not in size:
        raise ValueError(
            "size must contain 'shortest_edge' and 'longest_edge' keys."
        )
    else:
        # Keep the caller's values, but copy so the overrides below never
        # mutate the dict that was passed in.
        size = dict(size)
    # backward compatibility: override size with min_pixels and max_pixels
    # if they are provided.
    if min_pixels is not None:
        size["shortest_edge"] = min_pixels
    if max_pixels is not None:
        size["longest_edge"] = max_pixels
    self.min_pixels = size["shortest_edge"]
    self.max_pixels = size["longest_edge"]
    self.size = size

    self.do_resize = do_resize
    self.resample = resample
    self.do_rescale = do_rescale
    self.rescale_factor = rescale_factor
    self.do_normalize = do_normalize
    self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
    self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD

    self.patch_size = patch_size
    self.temporal_patch_size = temporal_patch_size
    self.merge_size = merge_size
    self.do_convert_rgb = do_convert_rgb
# hard-code
def _preprocess(
    self,
    images: ImageInput | VideoInput,
    do_resize: bool | None = None,
    size: dict[str, int] | None = None,
    resample: PILImageResampling = None,
    do_rescale: bool | None = None,
    rescale_factor: float | None = None,
    do_normalize: bool | None = None,
    image_mean: float | list[float] | None = None,
    image_std: float | list[float] | None = None,
    patch_size: int = 16,
    temporal_patch_size: int = 2,
    merge_size: int = 2,
    do_convert_rgb: bool | None = None,
    data_format: ChannelDimension | None = ChannelDimension.FIRST,
    input_data_format: str | ChannelDimension | None = None,
):
    """
    Resize, tensorize and normalize images, then flatten them into patches.

    Images must expose `.width`, `.height` and `.resize` (PIL-style). The
    target size is derived from the first image and applied to all of them.

    Args:
        images (`ImageInput`):
            Image or batch of images to preprocess.
        do_resize (`bool`, *optional*):
            Whether to resize to the size chosen by `smart_resize`, using
            `self.min_pixels` / `self.max_pixels`. If falsy, the original
            size is kept and must already be divisible by
            `patch_size * merge_size`.
        size, resample, do_rescale, rescale_factor, data_format, input_data_format:
            Accepted for signature compatibility; not used by this method.
        do_normalize (`bool`, *optional*):
            Whether to normalize the image tensor.
        image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
            Mean used when normalizing.
        image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
            Standard deviation used when normalizing.
        patch_size (`int`, *optional*, defaults to 16):
            The spatial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            Only used to compute `grid_t`.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
        do_convert_rgb (`bool`, *optional*):
            Whether to convert the image to RGB.

    Returns:
        `(flatten_patches, (grid_t, grid_h, grid_w))` where `flatten_patches`
        has shape `[grid_h * grid_w, channel * patch_size * patch_size]`.
    """  # noqa: E501
    images = make_list_of_images(images)

    if do_convert_rgb:
        images = [convert_to_rgb(image) for image in images]

    width, height = images[0].width, images[0].height
    resized_width, resized_height = width, height
    if do_resize:
        # The target size depends only on the first image, so compute it
        # once instead of once per image.
        resized_height, resized_width = smart_resize(
            height=height,
            width=width,
            factor=patch_size * merge_size,
            min_pixels=self.min_pixels,
            max_pixels=self.max_pixels,
        )

    # Honour explicit overrides and fall back to the configured statistics.
    mean = image_mean if image_mean is not None else self.image_mean
    std = image_std if image_std is not None else self.image_std
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize(mean, std) if do_normalize else None

    processed_images = []
    for image in images:
        if do_resize:
            image = image.resize((resized_width, resized_height))
        # Always convert to a channel-first tensor: the reshape below
        # relies on that layout even when normalization is off.
        image = to_tensor(image)
        if normalize is not None:
            image = normalize(image)
        processed_images.append(image)

    patches = np.stack([image.numpy() for image in processed_images])
    channel = patches.shape[1]
    grid_t = patches.shape[0] // temporal_patch_size
    grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
    # NOTE(review): the leading 1 means this reshape only succeeds when
    # exactly one image was passed in -- confirm that callers guarantee it.
    patches = patches.reshape(
        1,
        channel,
        grid_h // merge_size,
        merge_size,
        patch_size,
        grid_w // merge_size,
        merge_size,
        patch_size,
    )
    patches = patches.transpose(0, 2, 3, 5, 6, 1, 4, 7)
    flatten_patches = patches.reshape(
        1 * grid_h * grid_w, channel * patch_size * patch_size
    )

    return flatten_patches, (grid_t, grid_h, grid_w)
def preprocess(
self,
images: ImageInput,
videos: VideoInput = None,
do_resize: bool | None = None,
size: dict[str, int] | None = None,
min_pixels: int | None = None,
max_pixels: int | None = None,
resample: PILImageResampling = None,
do_rescale: bool | None = None,
rescale_factor: float | None = None,
do_normalize: bool | None = None,
image_mean: float | list[float] | None = None,
image_std: float | list[float] | None = None,
patch_size: int | None = None,
temporal_patch_size: int | None = None,
merge_size: int | None = None,
do_convert_rgb: bool | None = None,
return_tensors: str | TensorType | None = None,
data_format: ChannelDimension | None = ChannelDimension.FIRST,
input_data_format: str | ChannelDimension | None = None,
):
"""
Args:
images (`ImageInput`):
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
videos (`VideoInput`):
Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
the longest edge resized to keep the input aspect ratio.
resample (`int`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
has an effect if `do_resize` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
`True`.
min_pixels (`int`, *optional*, defaults to `self.min_pixels`):
The min pixels of the image to resize the image.
max_pixels (`int`, *optional*, defaults to `self.max_pixels`):
The max pixels of the image to resize the image.
patch_size (`int`, *optional*, defaults to `self.patch_size`):
The spatial patch size of the vision encoder.
temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
The temporal patch size of the vision encoder.
merge_size (`int`, *optional*, defaults to `self.merge_size`):
The merge size of the vision encoder to llm encoder.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
""" # noqa: E501
min_pixels = min_pixels if min_pixels is not None else self.min_pixels
max_pixels = max_pixels if max_pixels is not None else self.max_pixels
if size is not None:
if "shortest_edge" not in size or "longest_edge" not in size:
raise ValueError(
"size must contain 'shortest_edge' and 'longest_edge' keys."
)
min_pixels = size["shortest_edge"]
elif min_pixels is not None and max_pixels is not None:
# backward compatibility: override size with min_pixels and max_pixels
# if they are provided.
size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
else:
size = {**self.size}
do_resize = do_resize if do_resize is not None else self.do_resize
resample = resample if resample is not None else self.resample
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = (
rescale_factor if rescale_factor is not None else self.rescale_factor
)
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
patch_size = patch_size if patch_size is not None else self.patch_size
temporal_patch_size = (
temporal_patch_size
if temporal_patch_size is not None
else self.temporal_patch_size
)
merge_size = merge_size if merge_size is not None else self.merge_size
do_convert_rgb = (
do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
)
if images is not None:
images = make_flat_list_of_images(images)
if images is not None and not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
validate_preprocess_arguments(
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_resize=do_resize,
size=size,
resample=resample,
)
data = {}
if images is not None:
pixel_values, vision_grid_thws = [], []
for image in images:
patches, image_grid_thw = self._preprocess(
image,
do_resize=do_resize,
size=size,
resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
patch_size=patch_size,
temporal_patch_size=temporal_patch_size,
merge_size=merge_size,
data_format=data_format,
do_convert_rgb=do_convert_rgb,
input_data_format=input_data_format,
)
pixel_values.extend(patches)
vision_grid_thws.append(image_grid_thw)
pixel_values = np.array(pixel_values)
vision_grid_thws = np.array(vision_grid_thws)
data.update(
{"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
)
# kept for BC only and should be removed after v5.0
if videos is not None:
logger.warning(
"`HunYuanVLV1ImageProcessor` works only with image inputs "
"and doesn't process videos anymore. "
"This is a deprecated behavior and will be removed in v5.0. "
"Your videos should be forwarded to `HunYuanVLV1VideoProcessor`. "
)
videos = make_batched_videos(videos)
pixel_values_videos, vision_grid_thws_videos = [], []
for images in videos:
patches, video_grid_thw = self._preprocess(
images,
do_resize=do_resize,
size=size,
resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
patch_size=patch_size,
temporal_patch_size=temporal_patch_size,
merge_size=merge_size,
data_format=data_format,
do_convert_rgb=do_convert_rgb,
input_data_format=input_data_format,
)
pixel_values_videos.extend(patches)
vision_grid_thws_videos.append(video_grid_thw)
data.update(
{
"pixel_values_videos": np.array(pixel_values_videos),
"video_grid_thw": np.array(vision_grid_thws_videos),
}
)
return BatchFeature(data=data, tensor_type=return_tensors)
def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
    """
    Return the number of image patches produced for an image of a given size.

    Args:
        height (`int`):
            Height of the input image.
        width (`int`):
            Width of the input image.
        images_kwargs (`dict`, *optional*):
            Overrides for the image processor defaults. The keys read here
            are `min_pixels`, `max_pixels`, `patch_size` and `merge_size`;
            a missing key falls back to the processor's own setting.
            `None` (the default) means "no overrides".

    Returns:
        `int`: Number of image patches per image.
    """
    # The signature advertises `None` as the default, so normalise it to an
    # empty mapping instead of failing on the membership tests below.
    overrides = images_kwargs if images_kwargs is not None else {}

    # The processor stores its pixel bounds under the `size` dict
    # ("shortest_edge" holds the minimum, "longest_edge" the maximum).
    min_pixels = (
        overrides["min_pixels"]
        if "min_pixels" in overrides
        else self.size["shortest_edge"]
    )
    max_pixels = (
        overrides["max_pixels"]
        if "max_pixels" in overrides
        else self.size["longest_edge"]
    )
    patch_size = overrides.get("patch_size", self.patch_size)
    merge_size = overrides.get("merge_size", self.merge_size)

    # Resized dimensions are multiples of patch_size * merge_size.
    factor = patch_size * merge_size
    resized_height, resized_width = smart_resize(
        height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels
    )
    grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
    # NOTE(review): the "+ 1" per row and the trailing "+ 2" presumably account
    # for per-row separator tokens and begin/end markers -- confirm against the
    # model's placeholder layout.
    return grid_h * (grid_w + 1) + 2
# Make the class discoverable through transformers' `AutoImageProcessor`
# registry under the key "HunYuanVLImageProcessor" (runs at import time).
AutoImageProcessor.register("HunYuanVLImageProcessor", HunYuanVLImageProcessor)

View File

@@ -0,0 +1,458 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
# coding=utf-8
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import cached_property
import PIL
import torch
from transformers import AutoProcessor, BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from vllm.multimodal.image import convert_image_mode
# Public API of this module.
__all__ = ["OvisProcessor"]
# NOTE(review): not referenced anywhere else in this module. -100 presumably
# mirrors the label value that PyTorch loss functions ignore by default --
# confirm before relying on it.
IGNORE_ID = -100
class OvisProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    """Keyword-argument groups accepted by `OvisProcessor.__call__`.

    `_defaults` supplies the per-modality fallback values that
    `ProcessorMixin._merge_kwargs` merges with caller-provided kwargs.
    """

    _defaults = dict(
        # Prompts are tokenized without padding.
        text_kwargs=dict(padding=False),
        # Images are converted to RGB and returned as PyTorch tensors.
        images_kwargs=dict(do_convert_rgb=True, return_tensors="pt"),
    )
class OvisProcessor(ProcessorMixin):
    r"""
    Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
    [`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
    [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information.
    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor. Defaults to `None` in the signature, but the processing methods require it.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer. Defaults to `None` in the signature, but the processing methods require it.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
        image_pad_token (`str`, *optional*):
            Token string whose vocabulary id is used to fill the image placeholder positions.
        image_segment_len (`int`, *optional*, defaults to 255):
            Number of extra padding ids emitted for every image atom token.
    """

    # Names of the sub-components held by the processor.
    attributes = ["image_processor", "tokenizer"]
    # Extra constructor keywords besides the sub-components.
    valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"]
    # Auto classes used to load each sub-component.
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"
def __init__(
    self,
    image_processor=None,
    tokenizer=None,
    chat_template=None,
    image_pad_token=None,
    image_segment_len=255,
    **kwargs,
):
    """Store the Ovis-specific settings and initialise the wrapped components.

    Args:
        image_processor: Image processor handed to `ProcessorMixin`.
        tokenizer: Tokenizer handed to `ProcessorMixin`.
        chat_template (`str`, *optional*): Chat template handed to
            `ProcessorMixin`.
        image_pad_token (`str`, *optional*): Token string looked up in the
            tokenizer vocabulary to obtain the image padding id.
        image_segment_len (`int`, defaults to 255): Number of extra padding
            ids emitted per image atom token.
        **kwargs: Accepted for signature compatibility; not used here.
    """
    # Padding configuration for the expanded image placeholders.
    self.image_segment_len = image_segment_len
    self.image_pad_token = image_pad_token
    # Literal marker that stands for one image inside a prompt.
    self.image_token = "<image>"
    super().__init__(image_processor, tokenizer, chat_template=chat_template)
@cached_property
def extra_special_tokens(self):
    """Map special-token keys to the ids used while building model inputs.

    The structural image tokens use fixed negative sentinel ids; only
    `image_pad` is a real vocabulary id, looked up from the tokenizer with
    `self.image_pad_token` (a missing token raises `KeyError`).
    """
    token_ids = {
        "image_token": -200,
        "image_atom": -300,
        "image_start": -301,
        "image_prefix": -302,
        "image_col_sep": -303,
        "image_row_sep": -304,
        "image_end": -305,
        "image_pad": self.tokenizer.get_vocab()[self.image_pad_token],
    }
    return token_ids
def __call__(
    self,
    images: ImageInput = None,
    text: TextInput
    | PreTokenizedInput
    | list[TextInput]
    | list[PreTokenizedInput] = None,
    **kwargs: Unpack[OvisProcessorKwargs],
) -> BatchFeature:
    """
    Prepare text prompts and images for the Ovis model.

    Every image is run through [`~OvisProcessor.preprocess_image`]. Every prompt is tokenized around the
    `<image>` marker, and each marker is then replaced, in order of appearance across the batch, by the
    placeholder ids of the next processed image.

    Args:
        images (`PIL.Image.Image` or `list[PIL.Image.Image]`, *optional*):
            The image or list of images to prepare. A non-list value is treated as a single image.
        text (`str` or `list[str]`, *optional*):
            The prompt or batch of prompts. All prompts must end up with the same number of ids, because
            the rows are stacked into one tensor.
        max_partition (`int`, *optional*, defaults to 9):
            Upper bound on the number of crops an image may be split into.
        covering_threshold (`float`, *optional*, defaults to 0.9):
            Minimum covering ratio for a crop grid to be considered good enough.
        **kwargs:
            Remaining keywords are merged with the [`OvisProcessorKwargs`] defaults; the resulting
            `images_kwargs` are forwarded to [`~OvisProcessor.preprocess_image`].

    Returns:
        [`BatchFeature`]: When `text` is given, a [`BatchFeature`] with the following fields:
        - **input_ids** -- Tensor of token ids with image markers expanded to placeholder ids.
        - **pixel_values** -- List with one tensor of crops per image. Present only when images were processed.
        - **grids** -- List with one crop-grid tensor per image. Present only when images were processed.
        When `text` is `None`, the result only carries **image_placeholders** (one list of ids per image),
        or is empty if no image was processed.

    Raises:
        RuntimeError: If the prompts contain more `<image>` markers than images were provided.
    """
    max_partition = kwargs.pop("max_partition", 9)
    covering_threshold = kwargs.pop("covering_threshold", 0.9)
    output_kwargs = self._merge_kwargs(
        OvisProcessorKwargs,
        tokenizer_init_kwargs=self.tokenizer.init_kwargs,
        **kwargs,
    )

    # Process all images first
    image_features = {}
    if images is not None:
        processed_images = []
        image_placeholders_list = []
        grids = []
        for image in images if isinstance(images, list) else [images]:
            pixel_values, image_placeholders, grid = self.preprocess_image(
                image=image,
                max_partition=max_partition,
                covering_threshold=covering_threshold,
                **output_kwargs["images_kwargs"],
            )
            processed_images.append(pixel_values)
            image_placeholders_list.append(image_placeholders)
            grids.append(grid)
        # An empty image list leaves `image_features` empty on purpose.
        if processed_images:
            image_features["image_placeholders"] = image_placeholders_list

    # Process text input
    if text is not None:
        if not isinstance(text, list):
            text = [text]
        tokenized_batched_text = self._tokenize_with_image_symbol(text)
        image_token_id = self.get_token_value("image_token")
        placeholders = image_features.get("image_placeholders")
        replaced_ids_list = []
        # Index of the next unused image; shared by all prompts in the batch.
        idx = 0
        for ids_tensor in tokenized_batched_text:
            # Without processed images the markers are left untouched.
            if placeholders is not None and image_token_id in ids_tensor:
                new_ids = []
                for token_id in ids_tensor.tolist():
                    if token_id != image_token_id:
                        new_ids.append(token_id)
                        continue
                    # Check per marker, not per prompt, so a prompt with
                    # several markers cannot run past the available images.
                    if idx >= len(placeholders):
                        raise RuntimeError(
                            "Mismatch between the images you provided and the number of placeholder present in the text"
                        )
                    new_ids.extend(placeholders[idx])
                    idx += 1
                ids_tensor = torch.tensor(new_ids, dtype=torch.long)
            replaced_ids_list.append(ids_tensor)

        if replaced_ids_list:
            replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
        else:
            replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)

        # Create the output with text features
        output = BatchFeature(
            data={
                "input_ids": replaced_and_tokenized_ids,
            }
        )
        # Add image features if present
        if image_features:
            output["pixel_values"] = processed_images
            output["grids"] = grids
        return output

    # If only images were provided
    return BatchFeature(data=image_features)
def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
batch_token_ids = []
for text in text_list:
text_chunks = [
self.tokenizer(chunk, add_special_tokens=False).input_ids
for chunk in text.split(self.image_token)
]
token_ids = []
num_chuck = len(text_chunks)
for i, chunk in enumerate(text_chunks):
token_ids.extend(chunk)
if i < num_chuck - 1:
token_ids.append(self.get_token_value("image_token"))
batch_token_ids.append(token_ids)
return torch.tensor(batch_token_ids, dtype=torch.long)
def get_image_size(self):
    """Return the target `(height, width)` configured on the image processor.

    A `shortest_edge` entry is used for both dimensions; otherwise explicit
    `height` and `width` entries are required.

    Raises:
        ValueError: If the size config provides neither form.
    """
    size_cfg = self.image_processor.size
    if "shortest_edge" in size_cfg:
        edge = size_cfg["shortest_edge"]
        return edge, edge
    if "height" in size_cfg and "width" in size_cfg:
        return size_cfg["height"], size_cfg["width"]
    raise ValueError("Can't parse image size from image_processor config.")
def get_token_value(self, tok):
    """Return the id stored under key `tok` in `self.extra_special_tokens`."""
    token_table = self.extra_special_tokens
    return token_table[tok]
def construct_image_indicators(self, grid):
    """Build the structural token sequence describing one image.

    The sequence is `image_start`, `image_atom`, `image_prefix`, then -- only
    when the grid has more than one cell -- one `image_atom` per cell with
    `image_col_sep` between cells of a row and `image_row_sep` between rows,
    and finally `image_end`.

    Args:
        grid: `(rows, cols)` crop grid of the image.

    Returns:
        `list[int]`: The indicator token ids.
    """
    token = self.get_token_value
    rows, cols = grid[0], grid[1]
    indicators = [token("image_start"), token("image_atom"), token("image_prefix")]
    if rows * cols > 1:
        last_row, last_col = rows - 1, cols - 1
        for row in range(rows):
            for col in range(cols):
                indicators.append(token("image_atom"))
                if col != last_col:
                    indicators.append(token("image_col_sep"))
            if row != last_row:
                indicators.append(token("image_row_sep"))
    indicators.append(token("image_end"))
    return indicators
def construct_image_placeholders(self, grid):
    """Expand the image indicators into a run of padding ids.

    Every indicator token contributes one padding id; each `image_atom`
    indicator contributes `self.image_segment_len` additional padding ids.

    Args:
        grid: `(rows, cols)` crop grid of the image.

    Returns:
        `list[int]`: Padding ids, one per position the image occupies.
    """
    indicators = self.construct_image_indicators(grid)
    atom_id = self.get_token_value("image_atom")
    # Id of the padding token, resolved from the tokenizer vocabulary.
    pad_id = self.get_token_value("image_pad")
    padded = []
    for indicator in indicators:
        extra = self.image_segment_len if indicator == atom_id else 0
        padded += [pad_id] + [pad_id] * extra
    return padded
def preprocess_image(
    self,
    image: PIL.Image.Image,
    max_partition,
    covering_threshold,
    do_convert_rgb,
    return_tensors,
):
    """Split one image into square crops and build its placeholder ids.

    The image is partitioned on the grid that best covers it (see
    `_get_best_grid`). Every crop -- preceded by the full image when there
    is more than one crop -- is resized so its longer edge equals the
    processor's square side and is zero-padded to `side x side`.

    Args:
        image (`PIL.Image.Image`): The image to process.
        max_partition (`int`): Maximum number of grid cells; must be >= 1.
        covering_threshold (`float`): Covering ratio above which a grid is
            considered good enough.
        do_convert_rgb (`bool`): Whether to convert the image to RGB first.
        return_tensors (`str`): Passed to the image processor. The result is
            handled with torch operations, so it has to yield torch tensors
            (e.g. `"pt"`).

    Returns:
        `tuple`: `(pixel_values, image_placeholders, grid)` where
        `pixel_values` is a tensor of shape `(num_crops, 3, side, side)`,
        `image_placeholders` is the list of placeholder ids for the image and
        `grid` is a tensor holding the chosen `(rows, cols)`.

    Raises:
        ValueError: If `max_partition` is smaller than 1 or the configured
            image size is not square.
    """

    def _preprocess(img: PIL.Image.Image, side):
        # First resize so the longer edge equals `side`, keeping the ratio.
        w, h = img.size
        if w == h:
            new_width = new_height = side
        elif w > h:
            new_width = side
            new_height = int(h / w * new_width)
        else:
            new_height = side
            new_width = int(w / h * new_height)
        new_size = dict(height=new_height, width=new_width)
        pixel_values = self.image_processor.preprocess(
            img, size=new_size, return_tensors=return_tensors
        )["pixel_values"]

        # Then pad to a square, centring the image along the short axis.
        square_values = torch.zeros(
            [1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
        )
        new_height, new_width = pixel_values.shape[2:]
        if new_height == new_width:
            square_values[:, :, :, :] = pixel_values
        elif new_height > new_width:
            from_index = (side - new_width) // 2
            square_values[:, :, :, from_index : from_index + new_width] = (
                pixel_values
            )
        else:
            from_index = (side - new_height) // 2
            square_values[:, :, from_index : from_index + new_height, :] = (
                pixel_values
            )
        return square_values

    def _partition(img, grid) -> list[tuple[int, int, int, int]]:
        # Crop boxes (left, upper, right, lower) in row-major order; the last
        # row/column absorbs the remainder of the integer division.
        w, h = img.size
        row_height = h // grid[0]
        col_width = w // grid[1]
        partition = []
        for row in range(grid[0]):
            for col in range(grid[1]):
                left = col * col_width
                upper = row * row_height
                right = w if col == grid[1] - 1 else (col + 1) * col_width
                lower = h if row == grid[0] - 1 else (row + 1) * row_height
                partition.append((left, upper, right, lower))
        return partition

    def _covering_area(left, upper, right, lower, side):
        # Area of the crop that survives scaling its longer edge down to
        # `side` (crops that already fit keep their full area).
        w = right - left
        h = lower - upper
        w, h = max(w, h), min(w, h)
        if w > side:
            h = h / w * side
            w = side
        return w * h

    def _get_best_grid(img, side):
        img_area = img.size[0] * img.size[1]
        candidate_grids = [
            (i, j)
            for i in range(1, max_partition + 1)
            for j in range(1, max_partition + 1)
            if i * j <= max_partition
        ]
        all_grids = []
        good_grids = []
        for grid in candidate_grids:
            partition = _partition(img, grid)
            covering_ratio = (
                sum(_covering_area(*p, side) for p in partition) / img_area
            )
            assert covering_ratio <= 1.0
            all_grids.append((grid, covering_ratio))
            if covering_ratio > covering_threshold:
                good_grids.append((grid, covering_ratio))
        # `min` returns the first minimal entry, i.e. the same element a
        # stable sort would put first, without sorting the whole list.
        if good_grids:
            # pick the good partition with minimum #sub_images and break the tie using covering_ratio
            return min(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0]
        # pick the partition with maximum covering_ratio and break the tie using #sub_images
        return min(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0]

    # Without this guard no candidate grid exists and the selection below
    # would fail with an unrelated-looking error.
    if max_partition < 1:
        raise ValueError(f"max_partition must be at least 1, got {max_partition}")

    if do_convert_rgb:
        image = convert_image_mode(image, "RGB")

    sides = self.get_image_size()
    if sides[0] != sides[1]:
        raise ValueError("get_image_size() returns non-square size")
    side = sides[0]

    grid = _get_best_grid(image, side)
    partition = _partition(image, grid)
    crops = [image.crop(p) for p in partition]
    # With several crops the full image goes first as a global view.
    if len(crops) > 1:
        crops.insert(0, image)
    pixel_values = torch.cat([_preprocess(crop, side) for crop in crops], dim=0)
    image_placeholders = self.construct_image_placeholders(grid)
    # `torch.cat` already returns a fresh tensor; wrapping it in
    # `torch.tensor(...)` would only add a copy and a UserWarning.
    return pixel_values, image_placeholders, torch.tensor(grid)
def batch_decode(self, *args, **kwargs):
    """
    Delegate to the wrapped tokenizer's `batch_decode`, passing every positional and keyword argument
    through unchanged. See [`~PreTrainedTokenizer.batch_decode`] for the accepted arguments.
    """
    tokenizer_batch_decode = self.tokenizer.batch_decode
    return tokenizer_batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
    """
    Delegate to the wrapped tokenizer's `decode`, passing every positional and keyword argument through
    unchanged. See [`~PreTrainedTokenizer.decode`] for the accepted arguments.
    """
    tokenizer_decode = self.tokenizer.decode
    return tokenizer_decode(*args, **kwargs)
def post_process_image_text_to_text(self, generated_outputs):
    """
    Decode generated token ids into text.

    Args:
        generated_outputs (`torch.Tensor` or `np.ndarray`):
            The output of the model `generate` function, expected to be of shape `(batch_size, sequence_length)`
            or `(sequence_length,)`.

    Returns:
        `list[str]`: The decoded text, with special tokens skipped and tokenization spaces left as they are.
    """
    decode_options = {
        "skip_special_tokens": True,
        "clean_up_tokenization_spaces": False,
    }
    return self.tokenizer.batch_decode(generated_outputs, **decode_options)
@property
def model_input_names(self):
    """Input names of the tokenizer and image processor, de-duplicated in
    first-seen order, followed by `"second_per_grid_ts"`."""
    combined = (
        self.tokenizer.model_input_names + self.image_processor.model_input_names
    )
    seen = set()
    unique_names = []
    for name in combined:
        if name not in seen:
            seen.add(name)
            unique_names.append(name)
    return unique_names + ["second_per_grid_ts"]
# Make the class discoverable through transformers' `AutoProcessor` registry
# under the key "OvisProcessor" (runs at import time).
AutoProcessor.register("OvisProcessor", OvisProcessor)

View File

@@ -0,0 +1,481 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from functools import cached_property
import numpy as np
import PIL
import torch
from transformers import AutoProcessor, BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
# Public API of this module.
__all__ = ["Ovis2_5Processor"]
# Literal markers in a prompt that stand for one image / one video.
IMAGE_TOKEN = "<image>"
VIDEO_TOKEN = "<video>"
# Default lower / upper bounds on the pixel count (height * width) of a
# resized image; used as defaults by `smart_resize` and `preprocess_multidata`.
MIN_PIXELS = 448 * 448
MAX_PIXELS = 1792 * 1792
class Ovis2_5ProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    """Keyword-argument groups accepted by `Ovis2_5Processor.__call__`.

    `_defaults` supplies the per-modality fallback values that
    `ProcessorMixin._merge_kwargs` merges with caller-provided kwargs.
    """

    _defaults = dict(
        # Prompts are tokenized without padding.
        text_kwargs=dict(padding=False),
        # Images and video frames are converted to RGB by default.
        images_kwargs=dict(do_convert_rgb=True),
        videos_kwargs=dict(do_convert_rgb=True),
    )
class Ovis2_5Processor(ProcessorMixin):
    r"""
    Constructs an Ovis2.5 processor which wraps an Ovis image processor
    and a Qwen2 tokenizer into a single processor.
    [`Ovis2_5Processor`] offers all the functionalities of
    [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
    See [`~Ovis2_5Processor.__call__`] for more information.
    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor. Defaults to `None` in the signature,
            but the processing methods require it.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer. Defaults to `None` in the signature,
            but the processing methods require it.
        chat_template (`str`, *optional*): A Jinja template which will
            be used to convert lists of messages in a chat into
            a tokenizable string.
        image_pad_token (`str`, *optional*):
            Accepted for configuration compatibility; the processor
            always uses `"<|image_pad|>"`.
        patch_size (`int`, *optional*, defaults to 16):
            Spatial patch size; multiplied by `hidden_stride` to get the
            resize factor.
        hidden_stride (`int`, *optional*, defaults to 2):
            Stride whose square divides the number of visual atoms.
        temporal_patch_size (`int`, *optional*, defaults to 1):
            Temporal patch size that divides the number of visual atoms.
    """

    # Names of the sub-components held by the processor.
    attributes = ["image_processor", "tokenizer"]
    # Extra constructor keywords besides the sub-components.
    valid_kwargs = ["chat_template", "image_pad_token"]
    # Auto classes used to load each sub-component.
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"
def __init__(
    self,
    image_processor=None,
    tokenizer=None,
    chat_template=None,
    image_pad_token=None,
    patch_size=16,
    hidden_stride=2,
    temporal_patch_size=1,
    **kwargs,
):
    """Store the Ovis2.5 settings and initialise the wrapped components.

    Args:
        image_processor: Image processor handed to `ProcessorMixin`.
        tokenizer: Tokenizer handed to `ProcessorMixin`.
        chat_template (`str`, *optional*): Chat template handed to
            `ProcessorMixin`.
        image_pad_token (`str`, *optional*): Accepted but not used; see the
            note below.
        patch_size (`int`, defaults to 16): Spatial patch size.
        hidden_stride (`int`, defaults to 2): Stride applied to the patch grid.
        temporal_patch_size (`int`, defaults to 1): Temporal patch size.
        **kwargs: Accepted for signature compatibility; not used here.
    """
    # Patch geometry used when sizing images and counting visual atoms.
    self.patch_size = patch_size
    self.hidden_stride = hidden_stride
    self.temporal_patch_size = temporal_patch_size
    # Literal prompt markers for one image / one video.
    self.image_token = IMAGE_TOKEN
    self.video_token = VIDEO_TOKEN
    # NOTE(review): the `image_pad_token` argument is ignored; the attribute
    # is always this literal.
    self.image_pad_token = "<|image_pad|>"
    super().__init__(image_processor, tokenizer, chat_template=chat_template)
@cached_property
def extra_special_tokens(self):
    """Map special-token keys to their ids in the tokenizer vocabulary.

    Raises:
        ValueError: If any required token string is missing from the
            vocabulary; the first missing token (in declaration order) is
            reported together with a hint on how to add it.
    """
    required_tokens = {
        "image_token": "<image>",
        "video_token": "<video>",
        "visual_atom": "<ovis_visual_atom>",
        "image_start": "<ovis_image_start>",
        "image_end": "<ovis_image_end>",
        "video_start": "<ovis_video_start>",
        "video_end": "<ovis_video_end>",
        "image_pad": "<|image_pad|>",
    }
    vocab = self.tokenizer.get_vocab()

    first_missing = next(
        (name for name in required_tokens.values() if name not in vocab), None
    )
    if first_missing is not None:
        suggestion = (
            "please add '<image>', '<video>', '<ovis_visual_atom>', "
            "'<ovis_image_start>', '<ovis_image_end>', '<ovis_video_start>', "
            "'<ovis_video_end>' in 'additional_special_tokens' of "
            "tokenizer_config.json, You can refer to "
            "https://huggingface.co/AIDC-AI/Ovis2.6-30B-A3B/blob/main/tokenizer_config.json"
        )
        raise ValueError(f"Can not find {first_missing}, {suggestion}")

    return {key: vocab[name] for key, name in required_tokens.items()}
def __call__(
    self,
    images: ImageInput = None,
    videos: np.ndarray | list[ImageInput] = None,
    text: TextInput
    | PreTokenizedInput
    | list[TextInput]
    | list[PreTokenizedInput] = None,
    **kwargs: Unpack[Ovis2_5ProcessorKwargs],
) -> BatchFeature:
    """
    Prepare text prompts, images and videos for the Ovis2.5 model.

    Every image and every video is run through
    [`~Ovis2_5Processor.preprocess_multidata`]. Every prompt is tokenized
    around the `<image>` and `<video>` markers, and each marker is then
    replaced, in order of appearance across the batch, by the placeholder
    ids of the next processed image or video.

    Args:
        images (`PIL.Image.Image` or `list[PIL.Image.Image]`, *optional*):
            The image or list of images to prepare. A non-list value is
            treated as a single image.
        videos (`np.ndarray` or `list`, *optional*):
            The video or list of videos to prepare. A non-list value is
            treated as a single video; a list is iterated as one video
            per element.
        text (`str` or `list[str]`, *optional*):
            The prompt or batch of prompts. All prompts must end up with
            the same number of ids, because the rows are stacked into one
            tensor.
        **kwargs:
            Merged with the [`Ovis2_5ProcessorKwargs`] defaults; the
            resulting `images_kwargs` / `videos_kwargs` are forwarded to
            [`~Ovis2_5Processor.preprocess_multidata`].

    Returns:
        [`BatchFeature`]: When `text` is given, a [`BatchFeature`] with the
        following fields:
        - **input_ids** -- Tensor of token ids with visual markers expanded
          to placeholder ids.
        - **pixel_values** / **grids** -- One entry per image. Present only
          when images were processed.
        - **video_pixel_values** / **video_grids** -- One entry per video.
          Present only when videos were processed.
        When `text` is `None`, the result only carries
        **image_placeholders** and/or **video_placeholders**.

    Raises:
        RuntimeError: If a prompt that is being expanded contains more
            `<image>` or `<video>` markers than images or videos remain.
    """
    output_kwargs = self._merge_kwargs(
        Ovis2_5ProcessorKwargs,
        tokenizer_init_kwargs=self.tokenizer.init_kwargs,
        **kwargs,
    )

    # Process all visual inputs first
    visual_features = {}
    output = BatchFeature()
    if images is not None:
        processed_images = []
        image_placeholders_list = []
        image_grids = []
        for image in images if isinstance(images, list) else [images]:
            pixel_values, image_placeholders, grid = self.preprocess_multidata(
                images=image,
                **output_kwargs["images_kwargs"],
            )
            processed_images.append(pixel_values)
            image_placeholders_list.append(image_placeholders)
            image_grids.append(grid)
        # An empty image list adds nothing to the output on purpose.
        if processed_images:
            visual_features["image_placeholders"] = image_placeholders_list
            output["pixel_values"] = processed_images
            output["grids"] = image_grids

    if videos is not None:
        processed_videos = []
        videos_placeholders_list = []
        video_grids = []
        for video in videos if isinstance(videos, list) else [videos]:
            pixel_values, video_placeholders, grid = self.preprocess_multidata(
                video=video,
                **output_kwargs["videos_kwargs"],
            )
            processed_videos.append(pixel_values)
            videos_placeholders_list.append(video_placeholders)
            video_grids.append(grid)
        if processed_videos:
            visual_features["video_placeholders"] = videos_placeholders_list
            output["video_pixel_values"] = processed_videos
            output["video_grids"] = video_grids

    # Process text input
    if text is not None:
        if not isinstance(text, list):
            text = [text]
        tokenized_batched_text = self._tokenize_with_visual_symbol(text)
        image_token_id = self.get_token_value("image_token")
        video_token_id = self.get_token_value("video_token")
        image_placeholders = visual_features.get("image_placeholders", [])
        video_placeholders = visual_features.get("video_placeholders", [])
        replaced_ids_list = []
        # Indices of the next unused image / video; shared by all prompts.
        image_idx = 0
        video_idx = 0
        for ids_tensor in tokenized_batched_text:
            has_image_tokens = image_token_id in ids_tensor and image_idx < len(
                image_placeholders
            )
            has_video_tokens = video_token_id in ids_tensor and video_idx < len(
                video_placeholders
            )
            # Prompts with nothing left to expand are kept as tokenized.
            if has_image_tokens or has_video_tokens:
                new_ids = []
                for token_id in ids_tensor.tolist():
                    if token_id == image_token_id:
                        # Check per marker so a missing image surfaces as a
                        # clear error instead of a KeyError/IndexError.
                        if image_idx >= len(image_placeholders):
                            raise RuntimeError(
                                "Mismatch between the images you provided "
                                "and the number of placeholder present in "
                                "the text"
                            )
                        new_ids.extend(image_placeholders[image_idx])
                        image_idx += 1
                    elif token_id == video_token_id:
                        if video_idx >= len(video_placeholders):
                            raise RuntimeError(
                                "Mismatch between the videos you provided "
                                "and the number of placeholder present in "
                                "the text"
                            )
                        new_ids.extend(video_placeholders[video_idx])
                        video_idx += 1
                    else:
                        new_ids.append(token_id)
                ids_tensor = torch.tensor(new_ids, dtype=torch.long)
            replaced_ids_list.append(ids_tensor)

        if replaced_ids_list:
            replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
        else:
            replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)
        output["input_ids"] = replaced_and_tokenized_ids
        return output

    # If only visual inputs were provided
    return BatchFeature(data=visual_features)
def _tokenize_with_visual_symbol(self, text_list: list[str]) -> torch.LongTensor:
batch_token_ids = []
for text in text_list:
token_ids = []
video_token_id = self.get_token_value("video_token")
image_token_id = self.get_token_value("image_token")
video_split_texts = text.split(self.video_token)
for j, video_segment in enumerate(video_split_texts):
image_split_texts = video_segment.split(self.image_token)
text_chunks = [
self.tokenizer(chunk, add_special_tokens=False).input_ids
for chunk in image_split_texts
]
segment_tokens = []
for i, chunk in enumerate(text_chunks):
segment_tokens.extend(chunk)
if i < len(text_chunks) - 1:
segment_tokens.append(image_token_id)
token_ids.extend(segment_tokens)
if j < len(video_split_texts) - 1:
token_ids.append(video_token_id)
batch_token_ids.append(token_ids)
return torch.tensor(batch_token_ids, dtype=torch.long)
# Adapted from qwen2_vl
def smart_resize(
    self,
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
):
    """Rescales the image so that the following conditions are met:
    1. Both dimensions (height and width) are divisible by 'factor'.
    2. The total number of pixels is within the range
    ['min_pixels', 'max_pixels'].
    3. The aspect ratio of the image is maintained as closely as possible.

    Inputs smaller than 'factor' are scaled up so the shorter side equals
    'factor', and aspect ratios above 200 are clamped to 200; both cases
    print a notice instead of raising.

    Args:
        height (`int`): Original image height.
        width (`int`): Original image width.
        factor (`int`, defaults to 28): Value both output dimensions must
            be a multiple of.
        min_pixels (`int`): Lower bound on `height * width` of the result.
        max_pixels (`int`): Upper bound on `height * width` of the result.

    Returns:
        `tuple[int, int]`: The resized `(height, width)`. Each dimension is
        at least 'factor'; for extreme aspect ratios this takes precedence
        over the 'max_pixels' bound.
    """
    if height < factor or width < factor:
        print(
            f"height:{height} or width:{width} must be larger than factor:{factor}"
        )
        # Scale up so the shorter side is exactly `factor`.
        if height < width:
            width = round(factor / height * width)
            height = factor
        else:
            height = round(factor / width * height)
            width = factor
    elif max(height, width) / min(height, width) > 200:
        print(
            f"absolute aspect ratio must be smaller than 200, "
            f"got {max(height, width) / min(height, width)}"
        )
        # Clamp the longer side to 200x the shorter one.
        if height > width:
            height = 200 * width
        else:
            width = 200 * height
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        # Flooring can reach 0 for very elongated inputs; never return a
        # dimension smaller than one `factor`.
        h_bar = max(factor, math.floor(height / beta / factor) * factor)
        w_bar = max(factor, math.floor(width / beta / factor) * factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar
def get_token_value(self, tok):
    """Return the vocabulary id stored under key `tok` in
    `self.extra_special_tokens`."""
    token_table = self.extra_special_tokens
    return token_table[tok]
def construct_visual_indicators(self, grid, is_video: bool = False):
    """Build the structural token sequence for one image or video.

    The sequence is the start token, one `visual_atom`, then -- only when
    the grid has more than one cell -- one extra `visual_atom` per cell, and
    finally the end token.

    Args:
        grid: `(rows, cols)` grid; only the first two entries are read.
        is_video (`bool`): Use the video start/end tokens instead of the
            image ones.

    Returns:
        `list[int]`: The indicator token ids.
    """
    start_key, end_key = (
        ("video_start", "video_end") if is_video else ("image_start", "image_end")
    )
    start_token = self.get_token_value(start_key)
    end_token = self.get_token_value(end_key)

    indicators = [start_token, self.get_token_value("visual_atom")]
    if grid[0] * grid[1] > 1:
        indicators.extend(
            self.get_token_value("visual_atom")
            for _row in range(grid[0])
            for _col in range(grid[1])
        )
    indicators.append(end_token)
    return indicators
def construct_visual_placeholders(self, grid, is_video: bool = False):
    """Build the flat list of padding tokens that stands in for one visual.

    A single-cell indicator sequence is generated first. Each atom token in
    it is expanded into ``grid[0] * grid[1] * grid[2] // hidden_stride**2
    // temporal_patch_size`` padding tokens; every other indicator token
    is emitted as exactly one padding token. The returned list therefore
    contains only the ``image_pad`` token value.

    Args:
        grid: Indexable with three entries whose product is the patch
            count before merging.
        is_video: Forwarded to ``construct_visual_indicators``.

    Returns:
        List of ``image_pad`` token values.
    """
    indicators = self.construct_visual_indicators((1, 1), is_video)
    atom = self.get_token_value("visual_atom")
    # Extract the padding token ID from tokenizer
    pad = self.get_token_value("image_pad")

    atoms_per_visual = grid[0] * grid[1] * grid[2]
    atoms_per_visual = atoms_per_visual // self.hidden_stride**2
    atoms_per_visual = atoms_per_visual // self.temporal_patch_size

    # Atom positions fan out to `atoms_per_visual` pads; all others to one.
    padded = []
    for token in indicators:
        repeat = atoms_per_visual if token == atom else 1
        padded += [pad] * repeat
    return padded
def preprocess_multidata(
    self,
    images: PIL.Image.Image | list[PIL.Image.Image] | None = None,
    video: list[PIL.Image.Image] | np.ndarray | None = None,
    do_convert_rgb: bool | None = True,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
    return_tensors: str | None = "pt",
):
    """Turn image(s) or video frames into flattened patches plus placeholders.

    All frames are resized to one common size (derived from the size of the
    first frame via ``smart_resize``), stacked, padded along the frame axis
    to a multiple of ``temporal_patch_size`` by repeating the last frame,
    and rearranged into rows of
    ``channel * temporal_patch_size * patch_size * patch_size`` values.

    Args:
        images: A single PIL image or a list of PIL images. Takes
            precedence over ``video`` when both are given.
        video: Frames as a list of PIL images, or an ``np.ndarray`` whose
            first axis indexes frames (each frame is cast to ``uint8`` and
            passed to ``PIL.Image.fromarray``).
        do_convert_rgb: Convert frames whose mode is not ``"RGB"``.
        min_pixels: Lower bound on the resized pixel count; ``None`` falls
            back to ``MIN_PIXELS``. Clamped so it never exceeds
            ``max_pixels``.
        max_pixels: Upper bound on the resized pixel count; ``None`` falls
            back to ``MAX_PIXELS``.
        return_tensors: Accepted for interface compatibility; not read by
            this method.

    Returns:
        A 3-tuple of
        ``torch.Tensor`` with shape ``(grid_t * grid_h * grid_w,
        channel * temporal_patch_size * patch_size**2)``,
        the placeholder token list from ``construct_visual_placeholders``,
        and a ``torch.Tensor`` holding ``[[grid_t, grid_h, grid_w]]``.

    Raises:
        ValueError: If neither ``images`` nor ``video`` is provided.
        TypeError: If ``video`` is neither an ``np.ndarray`` nor a list.
    """
    is_video = False
    if images is not None:
        if not isinstance(images, list):
            images = [images]
    elif video is not None:
        is_video = True
        # type of video in dummy_mm_data is np.ndarray
        if isinstance(video, np.ndarray):
            images = [
                PIL.Image.fromarray(frame.astype(np.uint8)) for frame in video
            ]
        elif isinstance(video, list):
            images = video
        else:
            # Previously fell through with `images = None` and failed later
            # with an opaque "'NoneType' object is not iterable".
            raise TypeError(
                "video must be a numpy array or a list of PIL images, "
                f"got {type(video).__name__}"
            )
    else:
        raise ValueError("Either images or video should be provided.")

    # Resolve `None` for BOTH bounds before use; the lower bound may never
    # exceed the upper one.
    if max_pixels is None:
        max_pixels = MAX_PIXELS
    if min_pixels is None:
        min_pixels = MIN_PIXELS
    min_pixels = min(max_pixels, min_pixels)

    images = [
        image.convert("RGB") if do_convert_rgb and image.mode != "RGB" else image
        for image in images
    ]

    # Every frame is resized to the size derived from the FIRST frame so
    # the results can be stacked; the target is computed once.
    width, height = images[0].size
    resized_height, resized_width = self.smart_resize(
        height,
        width,
        factor=self.patch_size * self.hidden_stride,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )
    processed_images = [
        self.image_processor.preprocess(
            image, size=dict(height=resized_height, width=resized_width)
        )["pixel_values"][0]
        for image in images
    ]

    # assumes each processed frame is (channel, height, width) — TODO confirm
    patches = np.array(processed_images)
    remainder = patches.shape[0] % self.temporal_patch_size
    if remainder != 0:
        # Pad the frame axis by repeating the last frame.
        num_to_pad = self.temporal_patch_size - remainder
        repeats = np.repeat(patches[-1][np.newaxis], num_to_pad, axis=0)
        patches = np.concatenate([patches, repeats], axis=0)

    channel = patches.shape[1]
    grid_t = patches.shape[0] // self.temporal_patch_size
    grid_h = resized_height // self.patch_size
    grid_w = resized_width // self.patch_size
    patches = patches.reshape(
        grid_t,
        self.temporal_patch_size,
        channel,
        grid_h // self.hidden_stride,
        self.hidden_stride,
        self.patch_size,
        grid_w // self.hidden_stride,
        self.hidden_stride,
        self.patch_size,
    )
    # Group by (t, merged-h, merged-w, stride-h, stride-w) and put the
    # per-patch payload (channel, temporal, patch-h, patch-w) last.
    patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
    flatten_patches = patches.reshape(
        grid_t * grid_h * grid_w,
        channel * self.temporal_patch_size * self.patch_size * self.patch_size,
    )

    visual_placeholders = self.construct_visual_placeholders(
        [grid_t, grid_h, grid_w], is_video
    )
    return (
        torch.tensor(flatten_patches),
        visual_placeholders,
        torch.tensor([[grid_t, grid_h, grid_w]]),
    )
# Register the processor class with AutoProcessor under the key "Ovis2_5Processor".
AutoProcessor.register("Ovis2_5Processor", Ovis2_5Processor)

View File

@@ -0,0 +1,232 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
# mypy: ignore-errors
# coding=utf-8
# Copyright 2026 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import regex as re
import numpy as np
from transformers import AutoProcessor
from transformers.audio_utils import AudioInput
from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin
from transformers.tokenization_utils_base import TextInput
class Qwen3ASRProcessorKwargs(ProcessingKwargs, total=False):
    # Per-modality default keyword arguments; Qwen3ASRProcessor.__call__ hands
    # this class to `_merge_kwargs` to resolve the final tokenizer and
    # feature-extractor arguments.
    _defaults = dict(
        text_kwargs=dict(
            padding=False,
            padding_side="left",
        ),
        audio_kwargs=dict(
            sampling_rate=16000,
            padding=True,
            return_attention_mask=True,
        ),
    )
def _get_feat_extract_output_lengths(input_lengths):
    """
    Computes the output length of the convolutional layers and the output length of the audio encoder.

    Every complete block of 100 input steps contributes 13 output steps. The
    remaining ``input_lengths % 100`` steps are halved (rounding up) three
    times in a row and the result is added on top. Only ``%``, ``//``, ``+``,
    ``-`` and ``*`` are applied, so the input may be a plain integer or any
    array-like supporting those operators element-wise.
    """
    whole_blocks = input_lengths // 100
    leftover = input_lengths % 100
    # (x - 1) // 2 + 1 is ceil(x / 2) for integers; apply it three times.
    first_half = (leftover - 1) // 2 + 1
    second_half = (first_half - 1) // 2 + 1
    third_half = (second_half - 1) // 2 + 1
    return third_half + whole_blocks * 13
class Qwen3ASRProcessor(ProcessorMixin):
    r"""
    Constructs a Qwen3ASR processor.

    [`Qwen3ASRProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`], and [`Qwen2TokenizerFast`]. See the
    [`~Qwen3ASRProcessor.__call__`] and [`~Qwen3ASRProcessor.decode`] for more information.

    Args:
        feature_extractor ([`WhisperFeatureExtractor`], *optional*):
            The audio feature extractor.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The text tokenizer.
        chat_template (`Optional[str]`, *optional*):
            The Jinja template to use for formatting the conversation. If not provided, the default chat template is used.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(self, feature_extractor=None, tokenizer=None, chat_template=None):
        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
        # Cache the audio-related special tokens exposed by the tokenizer.
        self.audio_token = self.tokenizer.audio_token
        self.audio_bos_token = self.tokenizer.audio_bos_token
        self.audio_eos_token = self.tokenizer.audio_eos_token

    def __call__(
        self,
        text: TextInput = None,
        audio: AudioInput = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequence(s) and audio(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] to encode the text. To prepare
        the audio(s), this method forwards the `audio` and `kwargs` arguments to WhisperFeatureExtractor's
        [`~WhisperFeatureExtractor.__call__`] if `audio` is not `None`. Please refer to the docstring of the above two
        methods for more information.

        Each occurrence of the audio token in `text` is expanded to as many audio tokens as the corresponding audio
        produces encoder output steps (see `_get_feat_extract_output_lengths`).

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            audio (`np.ndarray`, `List[np.ndarray]`):
                The audio or batch of audio to be prepared. Each audio can be a NumPy array.

        Returns:
            [`BatchFeature`]: The tokenizer outputs merged with the feature extractor outputs; the feature
            extractor's `attention_mask` is exposed as `feature_attention_mask`.

        Raises:
            ValueError: If `text` is `None`.
        """
        if text is None:
            raise ValueError("You need to specify a `text` input to process.")

        output_kwargs = self._merge_kwargs(
            Qwen3ASRProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if audio is not None:
            # Padding on / truncation off are forced regardless of caller kwargs.
            output_kwargs["audio_kwargs"]["padding"] = True
            output_kwargs["audio_kwargs"]["truncation"] = False
            audio_inputs = self.feature_extractor(
                audio, **output_kwargs["audio_kwargs"]
            )
            # Rename so it cannot clash with the tokenizer's `attention_mask`
            # when both outputs are merged below.
            audio_inputs["feature_attention_mask"] = audio_inputs.pop(
                "attention_mask"
            )
            audio_lengths = iter(
                _get_feat_extract_output_lengths(
                    audio_inputs["feature_attention_mask"].sum(-1)
                )
            )
        else:
            audio_inputs = {}
            audio_lengths = iter([])

        if not isinstance(text, list):
            text = [text]

        text = self.replace_multimodal_special_tokens(
            text,
            audio_lengths,
        )

        texts_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        return BatchFeature(
            data={**texts_inputs, **audio_inputs},
            tensor_type=kwargs.get("return_tensors"),
        )

    def replace_multimodal_special_tokens(
        self,
        text,
        audio_lengths,
    ):
        """
        Expand every audio token in `text` to the length of its audio.

        Audio tokens are visited left to right across the samples, in order; the n-th one is replaced by
        `self.audio_token` repeated `int(next(audio_lengths))` times. The substitution is done in a single pass
        per sample, so inserted tokens are never re-matched and no other part of the text is altered.

        Args:
            text: Iterable of strings.
            audio_lengths: Iterator yielding one integer-like length per audio token occurrence.

        Returns:
            `list[str]`: The expanded samples, in input order.

        Raises:
            StopIteration: If `text` contains more audio tokens than `audio_lengths` yields.
        """
        audio_token = self.audio_token
        # Compile once for all samples instead of once per sample.
        pattern = re.compile(re.escape(audio_token))

        def _expand(_match):
            # int() accepts Python ints as well as NumPy / tensor scalars.
            return audio_token * int(next(audio_lengths))

        return [pattern.sub(_expand, sample) for sample in text]

    def get_chunked_index(
        self, token_indices: np.ndarray, tokens_per_chunk: int
    ) -> list[tuple[int, int]]:
        """
        Splits token index list into chunks based on token value ranges.

        Given a list of token indices, returns a list of (start, end) index tuples representing
        slices of the list where the token values fall within successive ranges of `tokens_per_chunk`.

        For example, if `tokens_per_chunk` is 1000, the function will create chunks such that:
        - the first chunk contains token values < 1000,
        - the second chunk contains values >= 1000 and < 2000, and so on.

        Note: the threshold advances by exactly one `tokens_per_chunk` step per emitted chunk, so the ranges above
        hold when no range is skipped by the input values.

        Parameters:
            token_indices (`np.ndarray`): A monotonically increasing list of token index values.
            tokens_per_chunk (`int`): Number of tokens per chunk (used as the chunk size threshold).

        Returns:
            `list[tuple[int, int]]`: A list of tuples, each representing the start (inclusive)
                                and end (exclusive) indices of a chunk in `token_indices`.
        """
        chunks = []
        start_idx = 0
        current_chunk = 1
        for i, token_value in enumerate(token_indices):
            if token_value >= current_chunk * tokens_per_chunk:
                # Crossed the current threshold: close the running chunk here.
                chunks.append((start_idx, i))
                start_idx = i
                current_chunk += 1
        # The trailing chunk always runs to the end of the input.
        chunks.append((start_idx, len(token_indices)))
        return chunks

    def apply_chat_template(self, conversations, chat_template=None, **kwargs):
        """Delegate to the parent implementation with `return_dict` forced to `False`."""
        kwargs["return_dict"] = False
        return super().apply_chat_template(conversations, chat_template, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer and feature-extractor input names plus `feature_attention_mask`, de-duplicated in order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        return list(
            dict.fromkeys(
                tokenizer_input_names
                + feature_extractor_input_names
                + ["feature_attention_mask"]
            )
        )
# Register the processor class with AutoProcessor under the key "Qwen3ASRProcessor".
AutoProcessor.register("Qwen3ASRProcessor", Qwen3ASRProcessor)