初始化项目,由ModelHub XC社区提供模型
Model: naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B Source: Original Platform
This commit is contained in:
912
processing_hyperclovax.py
Normal file
912
processing_hyperclovax.py
Normal file
@@ -0,0 +1,912 @@
|
||||
import copy
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import PIL
|
||||
from PIL import Image
|
||||
import torch
|
||||
from transformers.feature_extraction_utils import BatchFeature
|
||||
from transformers.image_utils import ImageInput, load_image
|
||||
from transformers.processing_utils import (
|
||||
AllKwargsForChatTemplate,
|
||||
ChatTemplateLoadKwargs,
|
||||
ProcessingKwargs,
|
||||
ProcessorMixin,
|
||||
Unpack,
|
||||
)
|
||||
from transformers.tokenization_utils_base import AudioInput, TextInput
|
||||
from transformers.utils import (
|
||||
is_torch_device,
|
||||
is_torch_dtype,
|
||||
logging,
|
||||
requires_backends,
|
||||
)
|
||||
from transformers.utils.chat_template_utils import render_jinja_template
|
||||
from transformers.video_utils import VideoInput, VideoMetadata, load_video
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class HCXBatchFeature(BatchFeature):
|
||||
def to(self, *args, **kwargs) -> "BatchFeature":
|
||||
"""
|
||||
Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in
|
||||
different `dtypes` and sending the `BatchFeature` to a different `device`.
|
||||
|
||||
Args:
|
||||
args (`Tuple`):
|
||||
Will be passed to the `to(...)` function of the tensors.
|
||||
kwargs (`Dict`, *optional*):
|
||||
Will be passed to the `to(...)` function of the tensors.
|
||||
To enable asynchronous data transfer, set the `non_blocking` flag in `kwargs` (defaults to `False`).
|
||||
|
||||
Returns:
|
||||
[`BatchFeature`]: The same instance after modification.
|
||||
"""
|
||||
requires_backends(self, ["torch"])
|
||||
import torch # noqa
|
||||
|
||||
new_data = {}
|
||||
device = kwargs.get("device")
|
||||
non_blocking = kwargs.get("non_blocking", False)
|
||||
# Check if the args are a device or a dtype
|
||||
if device is None and len(args) > 0:
|
||||
# device should be always the first argument
|
||||
arg = args[0]
|
||||
if is_torch_dtype(arg):
|
||||
# The first argument is a dtype
|
||||
pass
|
||||
elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
|
||||
device = arg
|
||||
else:
|
||||
# it's something else
|
||||
raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")
|
||||
# We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
|
||||
for k, v in self.items():
|
||||
# check if v is a floating point
|
||||
if isinstance(v, torch.Tensor) and torch.is_floating_point(v):
|
||||
# cast and send to device
|
||||
new_data[k] = v.to(*args, **kwargs)
|
||||
elif isinstance(v, torch.Tensor) and device is not None:
|
||||
new_data[k] = v.to(device=device, non_blocking=non_blocking)
|
||||
elif "pixel_values" in k:
|
||||
new_pixel_values_batch = []
|
||||
for _v in v:
|
||||
pixel_values = [pixel_value.to(device=device, non_blocking=non_blocking) for pixel_value in _v]
|
||||
new_pixel_values_batch.append(pixel_values)
|
||||
new_data[k] = new_pixel_values_batch
|
||||
else:
|
||||
new_data[k] = v
|
||||
self.data = new_data
|
||||
return self
|
||||
|
||||
|
||||
class HCXProcessorKwargs(ProcessingKwargs, total=False):
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"return_tensors": "pt",
|
||||
"calc_non_vision_query_lengths": False,
|
||||
},
|
||||
"images_kwargs": {},
|
||||
"audio_kwargs": {},
|
||||
"videos_kwargs": {
|
||||
"max_image_cnt": 12,
|
||||
"max_num_grids": 9,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class HCXProcessor(ProcessorMixin):
|
||||
attributes = ["image_processor", "tokenizer"]
|
||||
valid_kwargs = ["chat_template"]
|
||||
|
||||
image_processor_class = "AutoImageProcessor"
|
||||
tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast")
|
||||
|
||||
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
|
||||
self.image_token = "<|dummy3|>"
|
||||
self.video_token = "<|_unuse_missing_100270|>"
|
||||
self.image_token_pattern = re.compile(r"<\|dummy3\|>")
|
||||
self.video_token_pattern = re.compile(r"<\|_unuse_missing_100270\|>")
|
||||
self.image_video_token_pattern = re.compile(r"<\|dummy3\|>|<\|_unuse_missing_100270\|>")
|
||||
self.image_token_id = (
|
||||
tokenizer.image_token_id
|
||||
if getattr(tokenizer, "image_token_id", None)
|
||||
else tokenizer.convert_tokens_to_ids(self.image_token)
|
||||
)
|
||||
self.video_token_id = (
|
||||
tokenizer.video_token_id
|
||||
if getattr(tokenizer, "video_token_id", None)
|
||||
else tokenizer.convert_tokens_to_ids(self.video_token)
|
||||
)
|
||||
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
||||
|
||||
def apply_chat_template(
|
||||
self,
|
||||
conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
|
||||
chat_template: Optional[str] = None,
|
||||
**kwargs: Unpack[AllKwargsForChatTemplate],
|
||||
) -> str:
|
||||
"""
|
||||
Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
|
||||
conversations to turn them into a single tokenizable string.
|
||||
|
||||
The input is expected to be in the following format, where each message content is a list consisting of text and
|
||||
optionally image or video inputs. One can also provide an image, video, URL or local path which will be used to form
|
||||
`pixel_values` when `return_dict=True`. If not provided, one will get only the formatted text, optionally tokenized text.
|
||||
|
||||
conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
|
||||
{"type": "text", "text": "Please describe this image in detail."},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
Args:
|
||||
conversation (`Union[List[Dict, [str, str]], List[List[Dict[str, str]]]]`):
|
||||
The conversation to format.
|
||||
chat_template (`Optional[str]`, *optional*):
|
||||
The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
|
||||
chat template is used.
|
||||
"""
|
||||
|
||||
if chat_template is None:
|
||||
if isinstance(self.chat_template, dict) and "default" in self.chat_template:
|
||||
chat_template = self.chat_template["default"]
|
||||
elif isinstance(self.chat_template, dict):
|
||||
raise ValueError(
|
||||
'The processor has multiple chat templates but none of them are named "default". You need to specify'
|
||||
" which one to use by passing the `chat_template` argument. Available templates are: "
|
||||
f"{', '.join(self.chat_template.keys())}"
|
||||
)
|
||||
elif self.chat_template is not None:
|
||||
chat_template = self.chat_template
|
||||
else:
|
||||
raise ValueError(
|
||||
"Cannot use apply_chat_template because this processor does not have a chat template."
|
||||
)
|
||||
else:
|
||||
if isinstance(self.chat_template, dict) and chat_template in self.chat_template:
|
||||
# It's the name of a template, not a full template string
|
||||
chat_template = self.chat_template[chat_template]
|
||||
else:
|
||||
# It's a template string, render it directly
|
||||
chat_template = chat_template
|
||||
|
||||
if kwargs.get("continue_final_message", False):
|
||||
if kwargs.get("add_generation_prompt", False):
|
||||
raise ValueError(
|
||||
"continue_final_message and add_generation_prompt are not compatible. Use continue_final_message when you want the model to continue the final message, and add_generation_prompt when you want to add a header that will prompt it to start a new assistant message instead."
|
||||
)
|
||||
if kwargs.get("return_assistant_tokens_mask", False):
|
||||
raise ValueError("continue_final_message is not compatible with return_assistant_tokens_mask.")
|
||||
|
||||
# Fill sets of kwargs that should be used by different parts of template
|
||||
processed_kwargs = {
|
||||
"mm_load_kwargs": {},
|
||||
"template_kwargs": {},
|
||||
}
|
||||
|
||||
for kwarg_type in processed_kwargs:
|
||||
for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__.keys():
|
||||
kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type]
|
||||
default_value = getattr(kwarg_type_defaults, key, None)
|
||||
value = kwargs.pop(key, default_value)
|
||||
if value is not None and not isinstance(value, dict):
|
||||
processed_kwargs[kwarg_type][key] = value
|
||||
|
||||
# Pass unprocessed custom kwargs
|
||||
processed_kwargs["template_kwargs"].update(kwargs)
|
||||
|
||||
if isinstance(conversation, (list, tuple)) and (
|
||||
isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
|
||||
):
|
||||
is_batched = True
|
||||
conversations = conversation
|
||||
else:
|
||||
is_batched = False
|
||||
conversations = [conversation]
|
||||
|
||||
tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False)
|
||||
return_dict = processed_kwargs["template_kwargs"].pop("return_dict", False)
|
||||
mm_load_kwargs = processed_kwargs["mm_load_kwargs"]
|
||||
|
||||
if tokenize:
|
||||
batch_images, batch_videos = [], []
|
||||
batch_audios = []
|
||||
batch_video_metadata = []
|
||||
for conversation in conversations:
|
||||
images, videos = [], []
|
||||
video_metadata = []
|
||||
for message in conversation:
|
||||
visuals = [content for content in message["content"] if content["type"] in ["image", "video"]]
|
||||
audio_fnames = [
|
||||
content[key]
|
||||
for content in message["content"]
|
||||
for key in ["audio", "url", "path"]
|
||||
if key in content and content["type"] == "audio"
|
||||
]
|
||||
image_fnames = [
|
||||
vision_info[key]
|
||||
for vision_info in visuals
|
||||
for key in ["image", "url", "path", "base64"]
|
||||
if key in vision_info and vision_info["type"] == "image"
|
||||
]
|
||||
video_fnames = [
|
||||
vision_info[key]
|
||||
for vision_info in visuals
|
||||
for key in ["video", "url", "path"]
|
||||
if key in vision_info and vision_info["type"] == "video"
|
||||
]
|
||||
|
||||
for fname in image_fnames:
|
||||
images.append(load_image(fname))
|
||||
|
||||
# Audio models do not accept nested list of audios (yet!) so we construct a flat input audio list
|
||||
if not mm_load_kwargs["load_audio_from_video"]:
|
||||
for fname in audio_fnames:
|
||||
batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
|
||||
else:
|
||||
for fname in video_fnames:
|
||||
batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
|
||||
|
||||
for fname in video_fnames:
|
||||
if isinstance(fname, (list, tuple)) and isinstance(fname[0], str):
|
||||
video = [np.array(load_image(image_fname)) for image_fname in fname]
|
||||
# create a 4D video because `load_video` always returns a 4D array
|
||||
video = np.stack(video)
|
||||
metadata = None
|
||||
logger.warning(
|
||||
"When loading the video from list of images, we cannot infer metadata such as `fps` or `duration`. "
|
||||
"If your model uses this metadata during processing, please load the whole video and let the model sample frames instead."
|
||||
)
|
||||
else:
|
||||
# TODO: raushan, should be `self.video_processor.load_video_for_model` when API is added
|
||||
video, metadata = self._load_video_for_model(
|
||||
fname,
|
||||
num_frames=mm_load_kwargs.get("num_frames", None),
|
||||
fps=mm_load_kwargs.get("video_fps", None),
|
||||
backend=mm_load_kwargs["video_load_backend"],
|
||||
**kwargs,
|
||||
)
|
||||
videos.append(video)
|
||||
video_metadata.append(metadata)
|
||||
|
||||
# Currently all processors can accept nested list of batches, but not flat list of visuals
|
||||
# So we'll make a batched list of images and let the processor handle it
|
||||
if images:
|
||||
batch_images.append(images)
|
||||
if videos:
|
||||
batch_videos.append(videos)
|
||||
batch_video_metadata.append(video_metadata)
|
||||
|
||||
# Process conversation with video/image information if needed. Then convert into a prompt using Jinja template
|
||||
conversations = self._process_messages_for_chat_template(
|
||||
conversations,
|
||||
batch_images=batch_images,
|
||||
batch_videos=batch_videos,
|
||||
batch_video_metadata=batch_video_metadata,
|
||||
**processed_kwargs["mm_load_kwargs"],
|
||||
)
|
||||
|
||||
prompt, generation_indices = render_jinja_template(
|
||||
conversations=conversations,
|
||||
chat_template=chat_template,
|
||||
**processed_kwargs["template_kwargs"], # different flags such as `return_assistant_mask`
|
||||
**self.tokenizer.special_tokens_map, # tokenizer special tokens are used by some templates
|
||||
)
|
||||
|
||||
if not is_batched:
|
||||
prompt = prompt[0]
|
||||
|
||||
if tokenize:
|
||||
# Tokenizer's `apply_chat_template` never adds special tokens when tokenizing
|
||||
# But processor's `apply_chat_template` didn't have an option to tokenize, so users had to format the prompt
|
||||
# and pass it to the processor. Users thus never worried about special tokens relying on processor handling
|
||||
# everything internally. The below line is to keep BC for that and be able to work with model that have
|
||||
# special tokens in the template (consistent with tokenizers). We dont want to raise warning, it will flood command line
|
||||
# without actionable solution for users
|
||||
single_prompt = prompt[0] if is_batched else prompt
|
||||
if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
|
||||
kwargs["add_special_tokens"] = False
|
||||
|
||||
out = self(
|
||||
text=prompt,
|
||||
images=batch_images if batch_images else None,
|
||||
videos=batch_videos if batch_videos else None,
|
||||
audio=batch_audios if batch_audios else None,
|
||||
**kwargs,
|
||||
)
|
||||
if return_dict:
|
||||
if processed_kwargs["template_kwargs"].get("return_assistant_tokens_mask", False):
|
||||
assistant_masks = []
|
||||
input_ids = out["input_ids"]
|
||||
for i in range(len(input_ids)):
|
||||
current_mask = [0] * len(input_ids[i])
|
||||
for assistant_start_char, assistant_end_char in generation_indices[i]:
|
||||
start_token = out.char_to_token(i, assistant_start_char)
|
||||
end_token = out.char_to_token(i, assistant_end_char - 1)
|
||||
if start_token is None:
|
||||
# start_token is out of bounds maybe due to truncation.
|
||||
break
|
||||
for token_id in range(start_token, end_token + 1 if end_token else len(input_ids[i])):
|
||||
current_mask[token_id] = 1
|
||||
assistant_masks.append(current_mask)
|
||||
out["assistant_masks"] = assistant_masks
|
||||
out.convert_to_tensors(tensor_type=kwargs.get("return_tensors", None))
|
||||
|
||||
# vllm needs vision_query_lengths, but hf model doesn't need it
|
||||
del out["vision_query_lengths_images"]
|
||||
del out["vision_query_lengths_videos"]
|
||||
return out
|
||||
else:
|
||||
return out["input_ids"]
|
||||
|
||||
def repeat_dummy_tokens(self, input_ids, target_token_id, vision_query_lengths):
|
||||
input_ids = input_ids.clone().detach()
|
||||
batch_indices, target_indices = torch.where(input_ids == target_token_id)
|
||||
batch_size = input_ids.shape[0]
|
||||
|
||||
new_input_ids = [[] for _ in range(batch_size)]
|
||||
start_indices = [0 for _ in range(batch_size)]
|
||||
counter = [0 for _ in range(batch_size)]
|
||||
for batch_idx, target_idx in zip(batch_indices, target_indices):
|
||||
start_idx = start_indices[batch_idx]
|
||||
new_input_ids[batch_idx].append(input_ids[batch_idx][start_idx:target_idx])
|
||||
query_length = vision_query_lengths[batch_idx][counter[batch_idx]]
|
||||
new_input_ids[batch_idx].append(input_ids[batch_idx][target_idx].repeat(query_length))
|
||||
start_indices[batch_idx] = target_idx + 1
|
||||
counter[batch_idx] += 1
|
||||
|
||||
for batch_idx in range(batch_size):
|
||||
start_idx = start_indices[batch_idx]
|
||||
new_input_ids[batch_idx].append(input_ids[batch_idx][start_idx:]) # append remaining tokens
|
||||
new_input_ids[batch_idx] = torch.cat(new_input_ids[batch_idx], dim=0)
|
||||
|
||||
new_input_ids = torch.stack(new_input_ids)
|
||||
return new_input_ids
|
||||
|
||||
def _load_video_for_model(
|
||||
self,
|
||||
video: str,
|
||||
num_frames: Optional[int] = None,
|
||||
fps: Optional[int] = None,
|
||||
backend: str = "opencv",
|
||||
**kwargs: Unpack[HCXProcessorKwargs],
|
||||
) -> List[ImageInput]:
|
||||
"""
|
||||
Overrided function.
|
||||
|
||||
Loads `video` to a List[PIL.Image] (llava style)
|
||||
|
||||
Args:
|
||||
video (`str`):
|
||||
The video to convert to the numpy array format. Can be a link to video or local path.
|
||||
num_frames (`int`, *optional*):
|
||||
Number of frames to sample uniformly. If not passed, the whole video is loaded.
|
||||
fps (`int`, *optional*):
|
||||
Number of frames to sample per second. Should be passed only when `num_frames=None`.
|
||||
If not specified and `num_frames==None`, all frames are sampled.
|
||||
backend (`str`, *optional*, defaults to `"opencv"`):
|
||||
The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv".
|
||||
|
||||
Returns:
|
||||
Tuple[`np.array`, Dict]: A tuple containing:
|
||||
- List[PIL.Image] of frames in RGB.
|
||||
- Metadata dictionary.
|
||||
"""
|
||||
output_kwargs = self._merge_kwargs(
|
||||
HCXProcessorKwargs,
|
||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
logger.warning_once(f"num_frames control via argument is not supported yet. Ignored num_frames: {num_frames}.")
|
||||
logger.warning_once(f"fps control via argument is not supported yet. Ignored fps: {fps}.")
|
||||
logger.warning_once(f"backend control via argument is not supported yet. Ignored backend: {backend}.")
|
||||
|
||||
# video_loaded, video_metadata = load_video(
|
||||
# video, backend="decord", num_frames=32
|
||||
# )
|
||||
# frame_interval = int(video_metadata.total_num_frames / 32)
|
||||
# time_interval = frame_interval / video_metadata.fps
|
||||
# video_metadata.time_interval = time_interval
|
||||
|
||||
def _hcx_sample_indices_fn(metadata: VideoMetadata, num_frames=None, fps=None, **kwargs):
|
||||
max_num_grids = output_kwargs["videos_kwargs"]["max_num_grids"]
|
||||
max_image_cnt = output_kwargs["videos_kwargs"]["max_image_cnt"]
|
||||
frame_indices, time_interval = extract_frame_indices(
|
||||
metadata.duration,
|
||||
metadata.total_num_frames,
|
||||
metadata.fps,
|
||||
max_num_grids,
|
||||
max_image_cnt,
|
||||
default_interval=0.4,
|
||||
)
|
||||
metadata.time_interval = time_interval
|
||||
return np.array(frame_indices)
|
||||
|
||||
video_loaded, video_metadata = None, None
|
||||
for backend in ["decord", "pyav", "opencv", "torchvision"]:
|
||||
try:
|
||||
video_loaded, video_metadata = load_video(
|
||||
video, sample_indices_fn=_hcx_sample_indices_fn, backend=backend
|
||||
)
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading video with {backend} backend: {e}")
|
||||
continue
|
||||
|
||||
assert video_loaded is not None, "Failed to load video with any backend"
|
||||
|
||||
return video_loaded, video_metadata
|
||||
|
||||
def _process_messages_for_chat_template(
|
||||
self,
|
||||
conversation: List[List[Dict[str, str]]],
|
||||
batch_images: List[List[ImageInput]],
|
||||
batch_videos: List[List[VideoInput]],
|
||||
batch_video_metadata: List[List[Dict[str, any]]],
|
||||
**mm_load_kwargs: Unpack[ChatTemplateLoadKwargs],
|
||||
):
|
||||
"""
|
||||
Overrided function.
|
||||
Used within `apply_chat_template` when a model has a special way to process conversation history. For example,
|
||||
video models might want to specify in the prompt the duration of video or which frame indices at which timestamps
|
||||
were sampled. This information cannot be accessed before the video is loaded.
|
||||
|
||||
For most models it is a no-op, and must be overridden by model processors which require special processing.
|
||||
|
||||
Args:
|
||||
conversation (`List[Dict, str, str]`):
|
||||
The conversation to process. Always comes in batched format.
|
||||
batch_images (`List[List[ImageInput]]`):
|
||||
Batch of images that were loaded from url/path defined in the conversation. The images
|
||||
are ordered in the same way as in the conversation. Comes in nested list format, one list of `PIL` images
|
||||
per batch.
|
||||
batch_videos (`List[List[ImageInput]]`):
|
||||
Batch of videos that were loaded from url/path defined in the conversation. The videos
|
||||
are ordered in the same way as in the conversation. Comes in nested list format, one list of `PIL.Image`
|
||||
per batch.
|
||||
batch_video_metadata (`List[List[Dict[[str, any]]]]`):
|
||||
Batch of metadata returned from loading videos. That includes video fps, duration and total number of framer in original video.
|
||||
Metadata are ordered in the same way as `batch_videos`. Comes in nested list format, one list of `Dict`
|
||||
per batch.
|
||||
"""
|
||||
|
||||
is_video_in_conversation = False
|
||||
for batch_idx, messages in enumerate(conversation):
|
||||
is_video_in_messages = False
|
||||
is_image_in_messages = False
|
||||
for message in messages:
|
||||
for content in message["content"]:
|
||||
if content["type"] == "video":
|
||||
is_video_in_messages = True
|
||||
elif content["type"] == "image":
|
||||
is_image_in_messages = True
|
||||
if not is_video_in_messages:
|
||||
batch_videos.insert(batch_idx, [])
|
||||
batch_video_metadata.insert(batch_idx, [])
|
||||
if not is_image_in_messages:
|
||||
batch_images.insert(batch_idx, [])
|
||||
|
||||
is_video_in_conversation = is_video_in_conversation or is_video_in_messages
|
||||
|
||||
if not is_video_in_conversation:
|
||||
return conversation
|
||||
|
||||
# conversation processing
|
||||
new_conversation = []
|
||||
for batch_idx, messages in enumerate(conversation):
|
||||
video_counter = 0
|
||||
new_messages = []
|
||||
|
||||
for message in messages:
|
||||
new_message = {
|
||||
"role": message["role"],
|
||||
"content": [],
|
||||
}
|
||||
for content in message["content"]:
|
||||
if content["type"] == "video":
|
||||
video = batch_videos[batch_idx][video_counter]
|
||||
video_meta = batch_video_metadata[batch_idx][video_counter]
|
||||
|
||||
time_stamps = calc_timestamp_video_grids(video, video_meta.time_interval, max_grid_shape=(3, 3))
|
||||
video_counter += 1
|
||||
|
||||
if "filename" in content:
|
||||
filename = content["filename"]
|
||||
else:
|
||||
filename = content["video"].split("/")[-1]
|
||||
if len(filename) > 50:
|
||||
filename = f"{uuid.uuid4().hex}.mp4"
|
||||
basename, ext = os.path.splitext(filename)
|
||||
if ext == "":
|
||||
ext = ".mp4"
|
||||
|
||||
for frame_idx, time_stamp in enumerate(time_stamps):
|
||||
if frame_idx == len(video) - 1:
|
||||
# final_grid
|
||||
new_content = {
|
||||
"filename": f"{basename}-{frame_idx}{ext}",
|
||||
"video": content["video"],
|
||||
"type": "video",
|
||||
"video_time_stamp": time_stamp,
|
||||
"lens_keywords": content["lens_keywords"],
|
||||
"lens_local_keywords": content["lens_local_keywords"],
|
||||
"speech_to_text": content["speech_to_text"],
|
||||
"is_final_grid": True,
|
||||
}
|
||||
new_message["content"].append(new_content)
|
||||
else:
|
||||
new_content = {
|
||||
"filename": f"{basename}-{frame_idx}{ext}",
|
||||
"video": content["video"],
|
||||
"type": "video",
|
||||
"video_time_stamp": time_stamp,
|
||||
}
|
||||
new_message["content"].append(new_content)
|
||||
else:
|
||||
new_message["content"].append(copy.deepcopy(content))
|
||||
new_messages.append(new_message)
|
||||
new_conversation.append(new_messages)
|
||||
|
||||
return new_conversation
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: TextInput = None,
|
||||
images: List[List[ImageInput]] = None,
|
||||
videos: List[List[VideoInput]] = None,
|
||||
audio: AudioInput = None,
|
||||
**kwargs: Unpack[HCXProcessorKwargs],
|
||||
):
|
||||
output_kwargs = self._merge_kwargs(
|
||||
HCXProcessorKwargs,
|
||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# prepare model inputs
|
||||
mm_inputs = {
|
||||
"pixel_values_images": [],
|
||||
"image_sizes_images": [],
|
||||
"vision_query_lengths_images": [],
|
||||
"pixel_values_videos": [],
|
||||
# "image_sizes_videos": [],
|
||||
"vision_query_lengths_videos": [],
|
||||
}
|
||||
calc_non_vision_query_lengths = output_kwargs["text_kwargs"].pop("calc_non_vision_query_lengths")
|
||||
if calc_non_vision_query_lengths:
|
||||
mm_inputs["non_vision_query_lengths"] = []
|
||||
|
||||
# video processing
|
||||
if videos is not None:
|
||||
vit_input_size = self.image_processor.crop_size["width"]
|
||||
|
||||
video_kwargs = copy.deepcopy(output_kwargs["videos_kwargs"])
|
||||
|
||||
for videos_in_single_conversation in videos:
|
||||
pixel_values_videos = []
|
||||
vision_query_lengths_videos = []
|
||||
|
||||
for video_frames in videos_in_single_conversation:
|
||||
if len(video_frames) == 0:
|
||||
mm_inputs["pixel_values_videos"].append([])
|
||||
mm_inputs["vision_query_lengths_videos"].append([])
|
||||
continue
|
||||
video_frames_combined = combine_frames_into_images(
|
||||
video_frames, max_grid_shape=(3, 3), vit_input_size=vit_input_size
|
||||
)
|
||||
video_kwargs["is_video"] = True
|
||||
video_kwargs["return_tensors"] = None
|
||||
|
||||
frames_processed = self.image_processor(images=video_frames_combined, **video_kwargs)
|
||||
sizes = [(size["width"], size["height"]) for size in frames_processed["image_sizes"]]
|
||||
|
||||
pixel_values_videos.extend(frames_processed["pixel_values"])
|
||||
vision_query_lengths_videos.extend(frames_processed["vision_query_lengths"])
|
||||
|
||||
mm_inputs["pixel_values_videos"].append(pixel_values_videos)
|
||||
mm_inputs["vision_query_lengths_videos"].append(vision_query_lengths_videos)
|
||||
|
||||
# image processing
|
||||
if images is not None:
|
||||
image_kwargs = copy.deepcopy(output_kwargs["images_kwargs"])
|
||||
image_kwargs["is_video"] = False
|
||||
image_kwargs["return_tensors"] = None
|
||||
|
||||
for images_in_single_conversation in images:
|
||||
if isinstance(images_in_single_conversation, PIL.Image.Image): # single item to batch
|
||||
images_in_single_conversation = [images_in_single_conversation, ]
|
||||
if len(images_in_single_conversation) == 0:
|
||||
mm_inputs["pixel_values_images"].append([])
|
||||
mm_inputs["image_sizes_images"].append([])
|
||||
mm_inputs["vision_query_lengths_images"].append([])
|
||||
continue
|
||||
images_processed = self.image_processor(images=images_in_single_conversation, **image_kwargs)
|
||||
sizes = [(size["width"], size["height"]) for size in images_processed["image_sizes"]]
|
||||
|
||||
mm_inputs["pixel_values_images"].append(images_processed["pixel_values"])
|
||||
mm_inputs["image_sizes_images"].append(sizes)
|
||||
mm_inputs["vision_query_lengths_images"].append(images_processed["vision_query_lengths"])
|
||||
|
||||
# text processing
|
||||
def _create_replacer(_target_token, _replacements):
|
||||
_iterator = iter(_replacements)
|
||||
|
||||
def _replacer(match_obj):
|
||||
# return self.image_token
|
||||
num_query_tokens = next(_iterator)
|
||||
return "".join([_target_token for _ in range(num_query_tokens)])
|
||||
return _replacer
|
||||
|
||||
text_inputs = {}
|
||||
if text is not None:
|
||||
if not isinstance(text, list):
|
||||
text = [text]
|
||||
|
||||
if images is not None:
|
||||
new_texts = []
|
||||
for batch_idx, text_in_single_conversation in enumerate(text):
|
||||
new_text = self.image_token_pattern.sub(
|
||||
_create_replacer(self.image_token, mm_inputs["vision_query_lengths_images"][batch_idx]),
|
||||
text_in_single_conversation,
|
||||
)
|
||||
new_texts.append(new_text)
|
||||
text = new_texts
|
||||
|
||||
if videos is not None:
|
||||
new_texts = []
|
||||
for batch_idx, text_in_single_conversation in enumerate(text):
|
||||
new_text = self.video_token_pattern.sub(
|
||||
_create_replacer(self.video_token, mm_inputs["vision_query_lengths_videos"][batch_idx]),
|
||||
text_in_single_conversation,
|
||||
)
|
||||
new_texts.append(new_text)
|
||||
text = new_texts
|
||||
|
||||
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||
|
||||
# audio processing
|
||||
if audio is not None:
|
||||
raise NotImplementedError("Audio processing is not supported yet.")
|
||||
|
||||
return HCXBatchFeature(data={**text_inputs, **mm_inputs})
|
||||
|
||||
def decode(self, *args, **kwargs):
|
||||
"""
|
||||
This method forwards all its arguments to Siglip2Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
|
||||
the docstring of this method for more information.
|
||||
"""
|
||||
return self.tokenizer.decode(*args, **kwargs)
|
||||
|
||||
def batch_decode(self, *args, **kwargs):
|
||||
"""
|
||||
This method forwards all its arguments to Siglip2Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
|
||||
refer to the docstring of this method for more information.
|
||||
"""
|
||||
return self.tokenizer.batch_decode(*args, **kwargs)
|
||||
|
||||
def post_process_image_text_to_text(
|
||||
self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
|
||||
):
|
||||
"""
|
||||
Post-process the output of the model to decode the text.
|
||||
|
||||
Args:
|
||||
generated_outputs (`torch.Tensor` or `np.ndarray`):
|
||||
The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
|
||||
or `(sequence_length,)`.
|
||||
skip_special_tokens (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
|
||||
Clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
|
||||
**kwargs:
|
||||
Additional arguments to be passed to the tokenizer's `batch_decode method`.
|
||||
|
||||
Returns:
|
||||
`List[str]`: The decoded text.
|
||||
"""
|
||||
return self.tokenizer.batch_decode(
|
||||
generated_outputs,
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def model_input_names(self):
|
||||
tokenizer_input_names = self.tokenizer.model_input_names
|
||||
image_processor_input_names = self.image_processor.model_input_names
|
||||
names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||
return names_from_processor + []
|
||||
|
||||
|
||||
def extract_frame_indices(play_time, total_frames, fps, max_num_grids, max_image_cnt, default_interval=0.4):
|
||||
"""
|
||||
Extracts specific frame indices from a video based on duration, frame count, and sampling strategy.
|
||||
|
||||
The function determines which frames to extract given the video duration (`play_time`),
|
||||
total frame count, and frame rate. It samples frames at regular intervals (default: 0.4s),
|
||||
but if the number of frames exceeds the limit defined by `max_num_grids * max_image_cnt`,
|
||||
it performs uniform sampling to stay within that limit.
|
||||
|
||||
Args:
|
||||
play_time (float): Total play time of the video in seconds.
|
||||
total_frames (int): Total number of frames in the video.
|
||||
fps (float): Frames per second of the video.
|
||||
max_num_grids (int): Maximum number of grids to display.
|
||||
max_image_cnt (int): Maximum number of images per grid.
|
||||
default_interval (float, optional): Interval in seconds between frame samples. Defaults to 0.4.
|
||||
|
||||
Returns:
|
||||
Tuple:
|
||||
frame_indices (List[int]): A list of selected frame indices.
|
||||
time_interval (float): Time interval between selected frames (in seconds).
|
||||
"""
|
||||
|
||||
# Calculate how many frames to extract with the default interval
|
||||
default_frame_count = int(play_time / default_interval)
|
||||
|
||||
# Maximum frames allowed based on max_num_grids and max_image_cnt
|
||||
max_frames_allowed = max_num_grids * max_image_cnt
|
||||
|
||||
# Determine whether we can use the default interval or need uniform sampling
|
||||
if default_frame_count <= max_frames_allowed:
|
||||
# Default interval is sufficient, extract frames every 0.4 seconds
|
||||
frame_interval = int(total_frames / default_frame_count)
|
||||
else:
|
||||
# Use uniform sampling to fit within max_frames_allowed
|
||||
frame_interval = int(total_frames / max_frames_allowed)
|
||||
|
||||
# Extract frame indices at the calculated interval
|
||||
selected_indices = list(range(0, total_frames, frame_interval))
|
||||
|
||||
time_interval = frame_interval / fps
|
||||
|
||||
# Ensure the number of selected indices does not exceed max_frames_allowed
|
||||
return selected_indices[:max_frames_allowed], time_interval
|
||||
|
||||
|
||||
def calc_timestamp_video_grids(frames, time_interval, max_grid_shape=(3, 3)):
|
||||
"""
|
||||
Calculates the time range labels for each grid in a video.
|
||||
|
||||
Args:
|
||||
frames (List[PIL.Image.Image]): A list of frames extracted from a video.
|
||||
time_interval (float): Time interval (in seconds) between consecutive frames.
|
||||
max_grid_shape (Tuple[int, int], optional): The maximum grid shape as (rows, cols). Defaults to (3, 3).
|
||||
vit_input_size (int, optional): The target size (height and width) for the Vision Transformer input. Defaults to 378.
|
||||
|
||||
Returns:
|
||||
Tuple:
|
||||
image_time_stamps (List[str]): A list of time span labels for each combined image,
|
||||
e.g., ["0.00s~1.50s", "1.50s~3.00s", ...].
|
||||
"""
|
||||
max_num_grids = max_grid_shape[0] * max_grid_shape[1]
|
||||
# assert (
|
||||
# max_grid_shape[1] == 1
|
||||
# ), f"For video processing, decided to concatenate frames horizontally into a wide image."
|
||||
|
||||
# Calculate the number of canvases needed.
|
||||
num_frames = len(frames)
|
||||
num_canvases = num_frames // max_num_grids
|
||||
leftover_frames = num_frames % max_num_grids
|
||||
|
||||
time_stamp = 0 # second
|
||||
image_time_stamps = []
|
||||
|
||||
for canvas_idx in range(num_canvases):
|
||||
# Determine the frames to fill in the current canvas.
|
||||
start_idx = canvas_idx * max_num_grids
|
||||
end_idx = min(start_idx + max_num_grids, num_frames)
|
||||
|
||||
# Append the current canvas to the result list.
|
||||
frame_cnt = end_idx - start_idx
|
||||
image_time_stamps.append(f"{time_stamp:.2f}s~{time_stamp + frame_cnt * time_interval:.2f}s")
|
||||
time_stamp += frame_cnt * time_interval
|
||||
|
||||
if leftover_frames > 0:
|
||||
# Add the current canvas to the list of combined images.
|
||||
frame_cnt = leftover_frames
|
||||
image_time_stamps.append(f"{time_stamp:.2f}s~{time_stamp + frame_cnt * time_interval:.2f}s")
|
||||
time_stamp += frame_cnt * time_interval
|
||||
|
||||
return image_time_stamps
|
||||
|
||||
|
||||
def combine_frames_into_images(frames, max_grid_shape=(3, 3), vit_input_size=378):
|
||||
"""
|
||||
Combines a sequence of video frames into grid-based images and generates corresponding time range labels.
|
||||
|
||||
Frames are grouped and arranged into a grid (e.g., 3x3) such that each combined image contains up to
|
||||
`max_grid_shape[0] * max_grid_shape[1]` frames. Each combined image is resized to the given ViT input size.
|
||||
|
||||
Args:
|
||||
frames (NDArray): (num_frames, H, W, C) shape. A list of frames extracted from a video.
|
||||
time_interval (float): Time interval (in seconds) between consecutive frames.
|
||||
max_grid_shape (Tuple[int, int], optional): The maximum grid shape as (rows, cols). Defaults to (3, 3).
|
||||
vit_input_size (int, optional): The target size (height and width) for the Vision Transformer input. Defaults to 378.
|
||||
|
||||
Returns:
|
||||
Tuple:
|
||||
image_list (List[PIL.Image.Image]): A list of grid-combined images.
|
||||
"""
|
||||
max_num_grids = max_grid_shape[0] * max_grid_shape[1]
|
||||
# assert (
|
||||
# max_grid_shape[1] == 1
|
||||
# ), f"For video processing, decided to concatenate frames horizontally into a wide image."
|
||||
|
||||
# List to store the resulting combined images.
|
||||
image_list = []
|
||||
|
||||
# Calculate the number of canvases needed.
|
||||
num_frames = len(frames)
|
||||
num_canvases = num_frames // max_num_grids
|
||||
leftover_frames = num_frames % max_num_grids
|
||||
|
||||
# change frames (4d numpy tensor) to List[PIL.Image.Image]
|
||||
frames = [Image.fromarray(frame) for frame in frames]
|
||||
|
||||
for canvas_idx in range(num_canvases):
|
||||
# Initialize the current canvas.
|
||||
combined_image = Image.new(
|
||||
"RGB", (vit_input_size * max_grid_shape[0], vit_input_size * max_grid_shape[1]), color=(0, 0, 0)
|
||||
)
|
||||
|
||||
# Determine the frames to fill in the current canvas.
|
||||
start_idx = canvas_idx * max_num_grids
|
||||
end_idx = min(start_idx + max_num_grids, num_frames)
|
||||
|
||||
for idx in range(start_idx, end_idx):
|
||||
img = frames[idx]
|
||||
|
||||
# Resize each frame to a square shape.
|
||||
img_resized = img.resize((vit_input_size, vit_input_size))
|
||||
|
||||
# Calculate the (row, column) position to place the frame within the grid layout.
|
||||
local_idx = idx - start_idx
|
||||
x_offset = (local_idx % max_grid_shape[0]) * vit_input_size
|
||||
y_offset = (local_idx // max_grid_shape[0]) * vit_input_size
|
||||
|
||||
# Calculate the position to place the frame in the grid.
|
||||
combined_image.paste(img_resized, (x_offset, y_offset))
|
||||
|
||||
# Append the current canvas to the result list.
|
||||
image_list.append(combined_image)
|
||||
|
||||
if leftover_frames > 0:
|
||||
# canvas_idx might be undefined; default to 0 if not previously assigned to avoid "referenced before assignment" error.
|
||||
canvas_idx = num_canvases
|
||||
# Add the remaining frames to the final canvas.
|
||||
# combined_image = Image.new("RGB", (vit_input_size * leftover_frames, vit_input_size * 1), color=(0, 0, 0)) # hsk
|
||||
combined_image = Image.new(
|
||||
"RGB", (vit_input_size * max_grid_shape[0], vit_input_size * max_grid_shape[1]), color=(0, 0, 0)
|
||||
)
|
||||
|
||||
for idx in range(leftover_frames):
|
||||
img = frames[num_canvases * max_num_grids + idx]
|
||||
|
||||
# Resize the frame to a square (equal width and height).
|
||||
img_resized = img.resize((vit_input_size, vit_input_size))
|
||||
|
||||
# Calculate the (row, column) position to place the frame within the grid layout.
|
||||
# x_offset = (idx % leftover_frames) * vit_input_size # hsk
|
||||
# y_offset = (idx // leftover_frames) * vit_input_size # hsk
|
||||
x_offset = (idx % max_grid_shape[0]) * vit_input_size
|
||||
y_offset = (idx // max_grid_shape[0]) * vit_input_size
|
||||
|
||||
# Calculate the position to place the frame within the grid layout.
|
||||
combined_image.paste(img_resized, (x_offset, y_offset))
|
||||
|
||||
# Add the current canvas to the list of combined images.
|
||||
image_list.append(combined_image)
|
||||
|
||||
return image_list
|
||||
Reference in New Issue
Block a user