837 lines
29 KiB
Python
837 lines
29 KiB
Python
|
|
# SPDX-License-Identifier: Apache-2.0
|
|||
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|||
|
|
import math
|
|||
|
|
from abc import abstractmethod
|
|||
|
|
from io import BytesIO
|
|||
|
|
from typing import TYPE_CHECKING, Any, cast
|
|||
|
|
|
|||
|
|
import numpy as np
|
|||
|
|
import numpy.typing as npt
|
|||
|
|
|
|||
|
|
if TYPE_CHECKING:
|
|||
|
|
import cv2
|
|||
|
|
|
|||
|
|
from vllm.logger import init_logger
|
|||
|
|
from vllm.utils.registry import ExtensionManager
|
|||
|
|
|
|||
|
|
logger = init_logger(__name__)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
    """Resize every frame of a video to ``size`` == (height, width).

    Returns a new array of shape (num_frames, new_height, new_width, channels)
    with the same dtype as ``frames``.
    """
    num_frames, _, _, channels = frames.shape
    target_height, target_width = size
    resized = np.empty(
        (num_frames, target_height, target_width, channels), dtype=frames.dtype
    )
    # lazy import cv2 to avoid bothering users who only use text models
    import cv2

    # Note: cv2.resize takes (width, height), not (height, width).
    for frame_no in range(num_frames):
        resized[frame_no] = cv2.resize(frames[frame_no], (target_width, target_height))
    return resized
|
|||
|
|
|
|||
|
|
|
|||
|
|
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
    """Scale a video's spatial dimensions by ``size_factor`` (truncating to int)."""
    _, height, width, _ = frames.shape
    scaled_size = (int(height * size_factor), int(width * size_factor))
    return resize_video(frames, scaled_size)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def sample_frames_from_video(frames: npt.NDArray, num_frames: int) -> npt.NDArray:
    """Uniformly sample ``num_frames`` frames from a video.

    Passing ``num_frames == -1`` returns ``frames`` unchanged (no copy).
    """
    if num_frames == -1:
        return frames

    total_frames = frames.shape[0]
    # Evenly spaced indices from the first to the last frame (inclusive).
    chosen = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    return frames[chosen, ...]
|
|||
|
|
|
|||
|
|
|
|||
|
|
class VideoLoader:
    """Abstract base class for video-decoding backends.

    Subclasses implement :meth:`load_bytes` to decode raw video bytes into
    an RGB frame array plus a metadata dict, and may reuse the shared
    sequential readers :meth:`_read_frames` / :meth:`_read_frames_with_recovery`.
    """

    @classmethod
    @abstractmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, **kwargs
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode ``data`` into ``(frames, metadata)``.

        Args:
            data: Raw encoded video bytes.
            num_frames: Target number of frames to sample (-1 for all).

        Returns:
            Tuple of (frames array, metadata dict).
        """
        raise NotImplementedError

    @staticmethod
    def _can_use_for_recovery(
        idx: int,
        failed_frames: list[int],
        next_target_map: dict[int, int],
        total_frames: int,
    ) -> bool:
        """Check if current frame can recover the oldest failed frame.

        A frame at ``idx`` may stand in for the oldest failed target only
        while ``idx`` is still before that target's successor target index
        (or the end of the video when the target has no successor).
        """
        if not failed_frames:
            return False
        oldest_failed = failed_frames[0]
        limit = next_target_map.get(oldest_failed, total_frames)
        return idx < limit

    @staticmethod
    def _read_frames_with_recovery(
        cap: "cv2.VideoCapture",
        frame_indices: list[int],
        total_frames: int,
    ) -> tuple[npt.NDArray, list[int], dict[int, int]]:
        """
        Read frames with dynamic window forward-scan recovery.

        When a target frame fails to load, the next successfully grabbed
        frame (before the next target frame) will be used to recover it.

        Args:
            cap: OpenCV VideoCapture object
            frame_indices: Sorted list of target frame indices to load
            total_frames: Total number of frames in the video

        Returns:
            Tuple of (frames_array, valid_frame_indices, recovered_map)
            - frames_array: Array of loaded frames
            - valid_frame_indices: List of frame indices that were loaded
            - recovered_map: Dict mapping recovered_idx -> source_idx
        """
        # lazy import cv2 to avoid bothering users who only use text models
        import cv2

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        assert width > 0 and height > 0, (
            f"Invalid video frame size: width={width}, height={height}"
        )

        frame_idx_set = set(frame_indices)
        max_frame_idx = frame_indices[-1] if frame_indices else 0

        # Build map: target_idx -> next_target_idx (for recovery window)
        next_target_map: dict[int, int] = {}
        for k in range(len(frame_indices) - 1):
            next_target_map[frame_indices[k]] = frame_indices[k + 1]
        if frame_indices:
            # The last target's recovery window extends to the end of video.
            # (Guard avoids IndexError when frame_indices is empty.)
            next_target_map[frame_indices[-1]] = total_frames

        frames_list: list[npt.NDArray] = []
        valid_frame_indices: list[int] = []
        failed_frames_idx: list[int] = []
        recovered_map: dict[int, int] = {}

        for idx in range(max_frame_idx + 1):
            is_target_frame = idx in frame_idx_set

            # Attempt to grab the current frame
            ok = cap.grab()

            if not ok:
                if is_target_frame:
                    logger.warning(
                        "Failed to grab frame %d during video loading.",
                        idx,
                    )
                    failed_frames_idx.append(idx)
                continue

            # Check if we should retrieve: target frame OR can recover a failed one
            can_recover = VideoLoader._can_use_for_recovery(
                idx, failed_frames_idx, next_target_map, total_frames
            )

            if is_target_frame or can_recover:
                ret, frame = cap.retrieve()

                if ret and frame is not None and frame.size > 0:
                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frames_list.append(rgb_frame)
                    valid_frame_indices.append(idx)

                    if can_recover:
                        # This frame substitutes for the oldest failed target.
                        recovered_idx = failed_frames_idx.pop(0)
                        recovered_map[recovered_idx] = idx
                        logger.info(
                            "Recovered frame %d using frame %d (delay: %d)",
                            recovered_idx,
                            idx,
                            idx - recovered_idx,
                        )
                elif is_target_frame:
                    logger.warning(
                        "Failed to retrieve frame %d during video loading.",
                        idx,
                    )
                    failed_frames_idx.append(idx)

        # Log any remaining failed frames
        for failed_idx in failed_frames_idx:
            logger.warning(
                "Frame %d could not be recovered (end of video).",
                failed_idx,
            )

        # Stack frames
        if frames_list:
            frames = np.stack(frames_list)
        else:
            frames = np.empty((0, height, width, 3), dtype=np.uint8)

        return frames, valid_frame_indices, recovered_map

    @staticmethod
    def _read_frames(
        cap: "cv2.VideoCapture",
        frame_indices: set[int],
        num_expected_frames: int,
        max_frame_idx: int,
    ) -> tuple[npt.NDArray, int, list[int]]:
        """Sequentially read the target ``frame_indices`` from ``cap``.

        Broken/unreadable frames are skipped with a warning (no recovery).

        Returns:
            Tuple of (frames array truncated to the valid count,
            number of valid frames, list of frame indices actually read).
        """
        # lazy import cv2 to avoid bothering users who only use text models
        import cv2

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frames = np.empty((num_expected_frames, height, width, 3), dtype=np.uint8)

        i = 0
        valid_frame_indices: list[int] = []
        for idx in range(max_frame_idx + 1):
            ok = cap.grab()
            if not ok:
                # Frame is broken/unreadable, log warning
                if idx in frame_indices:
                    logger.warning(
                        "Failed to grab frame %d during video loading. "
                        "This frame will be skipped.",
                        idx,
                    )
                continue
            if idx in frame_indices:
                ret, frame = cap.retrieve()
                if ret:
                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    valid_frame_indices.append(idx)
                    i += 1
                else:
                    # retrieve() failed even though grab() succeeded
                    logger.warning(
                        "Failed to retrieve frame %d during video loading. "
                        "This frame will be skipped.",
                        idx,
                    )

        valid_num_frames = len(valid_frame_indices)
        if valid_num_frames < num_expected_frames:
            logger.warning(
                "Video loading completed with %d broken/unreadable frames. "
                "Expected %d frames but only loaded %d frames.",
                num_expected_frames - valid_num_frames,
                num_expected_frames,
                valid_num_frames,
            )

        return frames[:valid_num_frames], valid_num_frames, valid_frame_indices
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Global registry of video loader backends; the classes below self-register
# under string names (e.g. "opencv", "opencv_dynamic") via its decorator.
VIDEO_LOADER_REGISTRY = ExtensionManager()
|
|||
|
|
|
|||
|
|
|
|||
|
|
@VIDEO_LOADER_REGISTRY.register("opencv")
class OpenCVVideoBackend(VideoLoader):
    """OpenCV-based video loader with uniform frame sampling."""

    def get_cv2_video_api(self):
        """Return a videoio backend that supports buffered streams, or None.

        Non-built-in plugin backends must expose plugin ABI/API >= 1.2 to
        provide buffered-stream support; older plugins are skipped.
        """
        import cv2.videoio_registry as vr

        api_pref = None
        for backend in vr.getStreamBufferedBackends():
            if not vr.hasBackend(backend):
                continue
            if not vr.isBackendBuiltIn(backend):
                _, abi, api = vr.getStreamBufferedBackendPluginVersion(backend)
                if abi < 1 or (abi == 1 and api < 2):
                    continue
            api_pref = backend
            break
        return api_pref

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        fps: int = -1,
        max_duration: int = 300,
        frame_recovery: bool = False,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Load video frames from bytes.

        Args:
            data: Raw video bytes
            num_frames: Target number of frames to sample (-1 for all)
            fps: Target FPS for sampling (-1 for original)
            max_duration: Maximum duration (unused in base backend)
            frame_recovery: Enable forward-scan recovery for failed frames

        Returns:
            Tuple of (frames_array, metadata_dict)
        """
        import cv2

        backend = cls().get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        if not cap.isOpened():
            raise ValueError("Could not open video stream")

        try:
            total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            original_fps = cap.get(cv2.CAP_PROP_FPS)
            duration = total_frames_num / original_fps if original_fps > 0 else 0

            # resample video to target num_frames and fps
            # - the minimum of the two will be used
            num_frames_to_sample = total_frames_num
            if num_frames > 0:
                num_frames_to_sample = min(num_frames, total_frames_num)
            if fps > 0:
                num_frames_to_sample = min(
                    num_frames_to_sample, math.floor(duration * fps)
                )
            num_frames_to_sample = max(1, num_frames_to_sample)  # at least one sample

            if num_frames_to_sample == total_frames_num:
                frame_idx = list(range(0, num_frames_to_sample))
            else:
                uniform_sampled_frames = np.linspace(
                    0, total_frames_num - 1, num_frames_to_sample, dtype=int
                )
                frame_idx = uniform_sampled_frames.tolist()

            if frame_recovery:
                frames, valid_frame_indices, recovered_map = (
                    cls._read_frames_with_recovery(cap, frame_idx, total_frames_num)
                )
                valid_num_frames = len(valid_frame_indices)

                if recovered_map:
                    logger.info(
                        "Frame recovery: %d frames recovered using forward scan.",
                        len(recovered_map),
                    )
            else:
                frame_idx_set = set(frame_idx)
                frames, valid_num_frames, valid_frame_indices = cls._read_frames(
                    cap, frame_idx_set, num_frames_to_sample, max(frame_idx)
                )
        finally:
            # Release decoder resources promptly instead of relying on GC.
            cap.release()

        # Use transformers transformers.video_utils.VideoMetadata format
        # NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
        # can cause incorrect timestamp calculation without num_frames=-1.
        metadata = {
            "total_num_frames": total_frames_num,
            "fps": original_fps,
            "duration": duration,
            "video_backend": "opencv",
            "frames_indices": valid_frame_indices,
            # extra field used to control hf processor's video
            # sampling behavior
            "do_sample_frames": valid_num_frames == total_frames_num,
        }

        return frames, metadata
|
|||
|
|
|
|||
|
|
|
|||
|
|
@VIDEO_LOADER_REGISTRY.register("opencv_dynamic")
class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
    """OpenCV loader that samples frames dynamically based on video duration."""

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        fps: int = 2,
        max_duration: int = 300,
        frame_recovery: bool = False,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Load video frames with dynamic sampling based on duration.

        Args:
            data: Raw video bytes
            num_frames: Not used in dynamic backend
            fps: Target FPS for sampling (default: 2)
            max_duration: Maximum video duration to process (default: 300s)
            frame_recovery: Enable forward-scan recovery for failed frames

        Returns:
            Tuple of (frames_array, metadata_dict)
        """
        import cv2

        backend = cls().get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        if not cap.isOpened():
            raise ValueError("Could not open video stream")

        try:
            total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            original_fps = cap.get(cv2.CAP_PROP_FPS)
            duration = total_frames_num / original_fps if original_fps > 0 else 0

            # resample video to target num_frames
            max_frame_idx = total_frames_num - 1
            # NOTE(review): the fallback below divides by original_fps and
            # would raise ZeroDivisionError when original_fps == 0 — confirm
            # upstream guarantees a positive FPS for opened streams.
            duration = duration or round(max_frame_idx / original_fps) + 1

            # Refer to:
            # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
            frame_indices_list: list[int]
            if duration <= max_duration:
                n = int(math.floor(duration * fps))
                frame_indices_list = sorted(
                    {
                        min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
                        for i in range(n)
                    }
                )
            else:
                num_samples = int(max_duration * fps)
                if num_samples >= total_frames_num:
                    frame_indices_list = list(range(total_frames_num))
                else:
                    target_seconds = np.linspace(
                        0, duration, num_samples, endpoint=True
                    )
                    frame_indices_list = sorted(
                        {
                            min(max_frame_idx, int(math.ceil(t * original_fps)))
                            for t in target_seconds
                        }
                    )

            if frame_recovery:
                frames, valid_frame_indices, recovered_map = (
                    cls._read_frames_with_recovery(
                        cap, frame_indices_list, total_frames_num
                    )
                )
                valid_num_frames = len(valid_frame_indices)

                if recovered_map:
                    logger.info(
                        "Frame recovery: %d frames recovered using forward scan.",
                        len(recovered_map),
                    )
            else:
                frame_indices_set = set(frame_indices_list)
                frames, valid_num_frames, valid_frame_indices = cls._read_frames(
                    cap,
                    frame_indices_set,
                    len(frame_indices_list),
                    total_frames_num - 1,
                )
        finally:
            # Release decoder resources promptly instead of relying on GC.
            cap.release()

        # Use transformers transformers.video_utils.VideoMetadata format
        metadata = {
            "total_num_frames": total_frames_num,
            "fps": original_fps,
            "duration": duration,
            "video_backend": "opencv_dynamic",
            "frames_indices": valid_frame_indices,
            "do_sample_frames": False,
        }

        return frames, metadata
|
|||
|
|
|
|||
|
|
|
|||
|
|
@VIDEO_LOADER_REGISTRY.register("molmo2")
class Molmo2VideoBackend(VideoLoader):
    """OpenCV-based loader implementing Molmo2's frame-sampling strategies."""

    def get_cv2_video_api(self):
        """Return a videoio backend that supports buffered streams, or None.

        Non-built-in plugin backends must expose plugin ABI/API >= 1.2 to
        provide buffered-stream support; older plugins are skipped.
        """
        import cv2.videoio_registry as vr

        api_pref = None
        for backend in vr.getStreamBufferedBackends():
            if not vr.hasBackend(backend):
                continue
            if not vr.isBackendBuiltIn(backend):
                _, abi, api = vr.getStreamBufferedBackendPluginVersion(backend)
                if abi < 1 or (abi == 1 and api < 2):
                    continue
            api_pref = backend
            break
        return api_pref

    @classmethod
    def get_candidate_target_fps(
        cls,
        video_fps: float,
        sampling_fps: float,
        max_fps: float = 8.0,
    ) -> list[float]:
        """
        Return the subset of `video_fps` factors that remain multiples
        of `sampling_fps`.

        Examples:
            >>> Molmo2VideoBackend.get_candidate_target_fps(6, 2)
            [2.0, 6.0]
            >>> Molmo2VideoBackend.get_candidate_target_fps(5, 1)
            [1.0, 5.0]
            >>> Molmo2VideoBackend.get_candidate_target_fps(2, 2)
            [2.0]
            >>> Molmo2VideoBackend.get_candidate_target_fps(5, 2)
            Traceback (most recent call last):
                ...
            ValueError: sampling_fps=2 must divide video_fps=5.
        """
        # Validate before the int() conversion: int(None) would raise a
        # TypeError and make this check unreachable.
        if sampling_fps is None:
            raise ValueError("sampling_fps must be provided")

        video_fps = int(video_fps)
        sampling_fps = int(sampling_fps)
        max_fps = int(max_fps)

        if video_fps <= 0 or sampling_fps <= 0:
            raise ValueError(
                "video_fps and sampling_fps must be positive "
                f"(got {video_fps}, {sampling_fps})"
            )
        if video_fps % sampling_fps != 0:
            raise ValueError(
                f"sampling_fps={sampling_fps} must divide video_fps={video_fps}."
            )

        candidates = []
        for candidate in range(sampling_fps, video_fps + 1, sampling_fps):
            if candidate > max_fps:
                break
            if video_fps % candidate == 0:
                candidates.append(float(candidate))

        return candidates

    @classmethod
    def get_target_fps(
        cls,
        video_fps: float,
        max_frames: int,
        total_frames: int,
        frame_sample_mode: str,
        candidate_target_fps: list[float],
    ) -> float | None:
        """
        Get the target fps that best spans the video and has the most
        frames sampled.
        """
        num_frames_sampled = 0
        selected_target_fps = None
        for target_fps in candidate_target_fps:
            step_size = max(int(video_fps / target_fps), 1)
            num_frames_sampled_at_fps = int(total_frames / step_size)
            if num_frames_sampled == 0:
                if (
                    "uniform" in frame_sample_mode
                    and num_frames_sampled_at_fps > max_frames
                ):
                    break
                selected_target_fps = target_fps
                num_frames_sampled = num_frames_sampled_at_fps

            else:
                # the candidate sampling fps increases so frame count can't decrease
                assert num_frames_sampled <= num_frames_sampled_at_fps
                if num_frames_sampled_at_fps > max_frames:
                    # choose the sampling fps that spans the video
                    continue

                elif num_frames_sampled_at_fps > num_frames_sampled:
                    # both are less than max_frames; choose the one with higher
                    # density of frames sampled
                    selected_target_fps = target_fps
                    num_frames_sampled = num_frames_sampled_at_fps
        return selected_target_fps

    @classmethod
    def get_frame_times_and_chosen_fps(
        cls,
        selected_target_fps: float | None,
        total_frames: int,
        max_frames: int,
        video_fps: float,
    ) -> tuple[float | None, npt.NDArray]:
        """Compute the frame indices for the chosen fps (or uniform fallback).

        Returns the (possibly None) chosen fps together with the indices,
        truncated to at most ``max_frames``.
        """
        if selected_target_fps is None:
            # No candidate fps spans the video: fall back to uniform sampling.
            frame_indices = np.linspace(
                0, total_frames, max_frames, endpoint=False, dtype=int
            )
        else:
            step_size = max(int(video_fps / selected_target_fps), 1)
            frame_indices = np.arange(0, total_frames, step_size)
            if len(frame_indices) > max_frames:
                frame_indices = frame_indices[:max_frames]
        return selected_target_fps, frame_indices

    @classmethod
    def sample_times(
        cls,
        duration: float,
        max_frames: int,
        frame_sample_mode: str,
        max_fps: int | None,
        candidate_target_fps: list[float] | None = None,
        **kwargs,
    ) -> npt.NDArray:
        """Compute sampling timestamps (seconds) for the given mode.

        Raises:
            NotImplementedError: For unsupported ``frame_sample_mode`` values.
        """
        if frame_sample_mode == "fps":
            assert candidate_target_fps is not None
            # Try larger and larger FPSs until we hit one that can't span the video
            sampling_fps = candidate_target_fps[0]
            for candidate_fps in candidate_target_fps[1:]:
                if max_frames / candidate_fps < duration:
                    break
                sampling_fps = candidate_fps
            times = np.arange(0, max_frames) / sampling_fps
            times = times[times < duration]
            return times
        elif frame_sample_mode == "uniform_last_frame":
            if max_fps is not None:
                max_duration = (
                    max_frames - 1
                ) / max_fps  # -1 to include the last frame
                if max_duration < duration:
                    times = np.linspace(
                        0, duration, num=max_frames, endpoint=True, dtype=np.float64
                    )
                else:
                    times = np.arange(0.0, stop=duration, step=1 / max_fps)
                    times = np.concatenate([times, [duration]], axis=0)
                    assert len(times) <= max_frames
            else:
                times = np.linspace(
                    0, duration, num=max_frames, endpoint=True, dtype=np.float64
                )
            return times
        else:
            raise NotImplementedError(frame_sample_mode)

    @classmethod
    def _sample_frames(
        cls,
        total_num_frames: int,
        video_fps: float,
        duration: float,
        frame_sample_mode: str,
        num_frames: int,
        max_fps: int,
        sampling_fps: int,
    ) -> npt.NDArray:
        """Select the frame indices to read for the given sampling mode.

        Raises:
            NotImplementedError: For unsupported ``frame_sample_mode`` values.
        """
        if frame_sample_mode == "uniform_last_frame" and max_fps is not None:
            if total_num_frames <= 2:
                indices = np.arange(total_num_frames).astype(int)
            elif duration > (num_frames - 1) / max_fps:  # -1 to include the last frame
                # uniform fallback
                indices = np.linspace(
                    0,
                    total_num_frames - 1,
                    num=min(num_frames, total_num_frames),
                    endpoint=True,
                ).astype(int)
            else:
                float_indices = np.arange(
                    0.0,
                    stop=total_num_frames - 1,
                    step=float(video_fps / max_fps),
                )
                if np.round(float_indices[-1]) != total_num_frames - 1:
                    # Force-include the last frame.
                    float_indices = np.concatenate(
                        [float_indices, [total_num_frames - 1]], axis=0
                    )
                indices = np.round(float_indices).astype(int)
                assert indices[-1] < total_num_frames
                assert len(float_indices) <= num_frames
        elif frame_sample_mode == "uniform_last_frame":
            indices = np.linspace(
                0,
                total_num_frames - 1,
                num=min(num_frames, total_num_frames),
                endpoint=True,
            ).astype(int)
        elif frame_sample_mode == "fps":
            candidate_target_fps = cls.get_candidate_target_fps(video_fps, sampling_fps)
            selected_target_fps = cls.get_target_fps(
                video_fps,
                num_frames,
                total_num_frames,
                frame_sample_mode,
                candidate_target_fps,
            )
            _, indices = cls.get_frame_times_and_chosen_fps(
                selected_target_fps,
                total_num_frames,
                num_frames,
                video_fps,
            )
        else:
            raise NotImplementedError(frame_sample_mode)

        return indices

    @classmethod
    def load_bytes_opencv(
        cls,
        data: bytes,
        frame_sample_mode: str | None = None,
        num_frames: int = -1,
        max_fps: int = 2,
        sampling_fps: int = 2,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode frames via OpenCV using Molmo2's sampling strategy.

        With ``frame_sample_mode=None`` all frames are read and downstream
        (HF processor) sampling is enabled via ``do_sample_frames``.
        """
        import cv2

        backend = cls().get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        if not cap.isOpened():
            raise ValueError("Could not open video stream")

        try:
            total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            original_fps = cap.get(cv2.CAP_PROP_FPS)
            duration = total_frames_num / original_fps if original_fps > 0 else 0

            if frame_sample_mode is None:
                # Use transformers transformers.video_utils.VideoMetadata format
                frame_idx = list(range(0, total_frames_num))
                frame_idx_set = set(frame_idx)
                frames, valid_num_frames, valid_frame_indices = cls._read_frames(
                    cap, frame_idx_set, total_frames_num, max(frame_idx)
                )
                do_sample_frames = valid_num_frames == total_frames_num
                metadata = {
                    "total_num_frames": total_frames_num,
                    "fps": original_fps,
                    "duration": duration,
                    "video_backend": "opencv",
                    "do_sample_frames": do_sample_frames,
                }
                if not do_sample_frames:
                    metadata["frames_indices"] = valid_frame_indices
                return frames, metadata

            frame_idx = cls._sample_frames(
                total_frames_num,
                original_fps,
                duration,
                frame_sample_mode,
                num_frames,
                max_fps,
                sampling_fps,
            ).tolist()

            frames, valid_num_frames, valid_frame_indices = cls._read_frames(
                cap,
                set(frame_idx),
                len(frame_idx),
                total_frames_num - 1,
            )
        finally:
            # Release decoder resources promptly instead of relying on GC.
            cap.release()

        metadata = {
            "total_num_frames": total_frames_num,
            "fps": original_fps,
            "duration": duration,
            "video_backend": "opencv",
            "frames_indices": valid_frame_indices,
            "do_sample_frames": False,
        }

        return frames, metadata

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Entry point: extract Molmo2 sampling kwargs and delegate."""
        frame_sample_mode = cast(str | None, kwargs.pop("frame_sample_mode", None))
        max_fps = cast(int, kwargs.pop("max_fps", 2))
        sampling_fps = cast(int, kwargs.pop("sampling_fps", 2))
        out = cls.load_bytes_opencv(
            data,
            frame_sample_mode,
            num_frames,
            max_fps,
            sampling_fps,
            **kwargs,
        )
        return out
|
|||
|
|
|
|||
|
|
|
|||
|
|
@VIDEO_LOADER_REGISTRY.register("openpangu")
class OpenCVDynamicOpenPanguVideoBackend(OpenCVVideoBackend):
    """OpenCV loader for openPangu with timestamp-aligned dynamic sampling."""

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = 32,
        fps: int = 1,
        max_duration: int = 300,
        frame_recovery: bool = False,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Load video frames with dynamic sampling based on duration.
        Assume that total_num_frames = 10 and fps = 1.
        The timestamp of frame 0 is 0.0.
        The timestamp of frame 1 is 1.0. ...
        The timestamp of frame 9 (the last frame) should be 9.0, that is,
        (total_frames_num - 1) / original_fps.

        Args:
            data: Raw video bytes
            num_frames: Maximum number of frames to sample (default: 32)
            fps: Target FPS for sampling (default: 1); -1 disables the limit
            max_duration: Unused in this backend
            frame_recovery: Unused; forward-scan recovery is always applied

        Returns:
            Tuple of (frames_array, metadata_dict)
        """
        import cv2

        backend = cls().get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        if not cap.isOpened():
            raise ValueError("Could not open video stream")

        try:
            total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            original_fps = float(cap.get(cv2.CAP_PROP_FPS))
            # The timestamp of the rightmost frame, cannot be used to calculate frame 0.
            if total_frames_num >= 1 and original_fps > 0:
                total_duration = (total_frames_num - 1) / original_fps
            else:
                total_duration = 0

            # `fps` is the FPS parameter passed in for sampling,
            # -1 indicates that sampling can be performed directly without FPS limitation.  # noqa: E501
            if fps > 0:
                # Num_frames is the maximum number of frames to sample.
                # If fewer frames are sampled at this sample_fps, the update duration will be longer. # noqa: E501
                if num_frames >= int(total_duration * fps) + 1:
                    num_frames = int(total_duration * fps) + 1
                # Under the new maximum frame rate, the video duration of the rightmost frame, # noqa: E501
                # cannot be calculated for frame 0.
                total_duration = min(total_duration, (num_frames - 1) / fps)
            elif fps != -1:
                raise ValueError(
                    f"requires dataset fps is -1 or greater than 0 but got {fps}"
                )

            sample_frame_timestamps = np.linspace(
                0, total_duration, num_frames, dtype=float
            )
            # NOTE(review): rounding timestamps may yield duplicate indices;
            # the recovery reader tolerates them but frame count can shrink.
            frames_indices = [
                min(total_frames_num - 1, round(t * original_fps))
                for t in sample_frame_timestamps
            ]

            frames, valid_frame_indices, recovered_map = (
                cls._read_frames_with_recovery(cap, frames_indices, total_frames_num)
            )
        finally:
            # Release decoder resources promptly instead of relying on GC.
            cap.release()

        if recovered_map:
            logger.info(
                "Frame recovery: %d frames recovered using forward scan.",
                len(recovered_map),
            )

        metadata = {
            "total_num_frames": total_frames_num,
            "fps": original_fps,
            "duration": total_duration,
            "video_backend": "opencv_dynamic_openpangu",
            "frames_indices": valid_frame_indices,
            "do_sample_frames": False,
        }
        return frames, metadata
|