Files

837 lines
29 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from abc import abstractmethod
from io import BytesIO
from typing import TYPE_CHECKING, Any, cast
import numpy as np
import numpy.typing as npt
if TYPE_CHECKING:
    # Imported for type annotations only; functions that actually use OpenCV
    # import it lazily at call time.
    import cv2
from vllm.logger import init_logger
from vllm.utils.registry import ExtensionManager
logger = init_logger(__name__)
def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
    """Resize every frame of a video to a common spatial size.

    Args:
        frames: 4-D array unpacked as ``(num_frames, height, width, channels)``.
        size: Target size given as ``(height, width)``.

    Returns:
        A newly allocated array of shape
        ``(num_frames, size[0], size[1], channels)`` with the dtype of
        ``frames``.
    """
    frame_count, _, _, num_channels = frames.shape
    target_h, target_w = size
    out_shape = (frame_count, target_h, target_w, num_channels)
    output = np.empty(out_shape, dtype=frames.dtype)

    # lazy import cv2 to avoid bothering users who only use text models
    import cv2

    # cv2.resize takes the destination size as (width, height).
    dsize = (target_w, target_h)
    for pos in range(frame_count):
        output[pos] = cv2.resize(frames[pos], dsize)
    return output
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
    """Scale the height and width of all frames by ``size_factor``.

    The scaled dimensions are truncated to integers with ``int()`` and the
    actual resizing is delegated to ``resize_video``.
    """
    _, src_height, src_width, _ = frames.shape
    target_size = (int(src_height * size_factor), int(src_width * size_factor))
    return resize_video(frames, target_size)
def sample_frames_from_video(frames: npt.NDArray, num_frames: int) -> npt.NDArray:
    """Select ``num_frames`` evenly spaced frames along the first axis.

    ``num_frames == -1`` is a sentinel for "keep everything": the input array
    itself is returned. Otherwise the picked positions are
    ``np.linspace(0, len(frames) - 1, num_frames)`` truncated to integers.
    """
    frame_count = frames.shape[0]
    if num_frames != -1:
        picks = np.linspace(0, frame_count - 1, num_frames, dtype=int)
        return frames[picks, ...]
    return frames
class VideoLoader:
    """Base class for video loading backends.

    Subclasses implement ``load_bytes``. The static helpers defined here read
    frames from an already opened ``cv2.VideoCapture`` by walking the stream
    sequentially with ``grab()`` and only decoding (``retrieve()``) the frames
    that are actually needed.

    NOTE: ``load_bytes`` is marked with ``@abstractmethod`` but this class does
    not derive from ``abc.ABC``, so the marker is not enforced on
    instantiation; the base implementation raises ``NotImplementedError``.
    """

    @classmethod
    @abstractmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, **kwargs
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode raw video bytes into ``(frames, metadata)``.

        Must be overridden by concrete backends.
        """
        raise NotImplementedError

    @staticmethod
    def _can_use_for_recovery(
        idx: int,
        failed_frames: list[int],
        next_target_map: dict[int, int],
        total_frames: int,
    ) -> bool:
        """Check if current frame can recover the oldest failed frame.

        Args:
            idx: Index of the frame that was just grabbed.
            failed_frames: Pending failed target indices, oldest first.
            next_target_map: Maps a target index to the exclusive end of its
                recovery window.
            total_frames: Window end used when a failed index has no entry
                in ``next_target_map``.

        Returns:
            True when there is a pending failure and ``idx`` lies before the
            end of the oldest failure's recovery window.
        """
        if not failed_frames:
            return False
        oldest_failed = failed_frames[0]
        limit = next_target_map.get(oldest_failed, total_frames)
        return idx < limit

    @staticmethod
    def _build_next_target_map(
        frame_indices: list[int], total_frames: int
    ) -> dict[int, int]:
        """Map every target index to the exclusive end of its recovery window.

        The window of a target ends at the following target; the window of
        the last target ends at ``total_frames``. An empty ``frame_indices``
        yields an empty map instead of raising.
        """
        next_target_map = dict(zip(frame_indices, frame_indices[1:]))
        if frame_indices:
            next_target_map[frame_indices[-1]] = total_frames
        return next_target_map

    @staticmethod
    def _pop_expired_failures(
        idx: int,
        failed_frames: list[int],
        next_target_map: dict[int, int],
        total_frames: int,
    ) -> list[int]:
        """Remove failures whose recovery window ended at or before ``idx``.

        ``failed_frames`` is modified in place (entries are popped from the
        front, i.e. oldest first). The removed indices are returned so the
        caller can report them.
        """
        expired: list[int] = []
        while failed_frames and idx >= next_target_map.get(
            failed_frames[0], total_frames
        ):
            expired.append(failed_frames.pop(0))
        return expired

    @staticmethod
    def _read_frames_with_recovery(
        cap: "cv2.VideoCapture",
        frame_indices: list[int],
        total_frames: int,
    ) -> tuple[npt.NDArray, list[int], dict[int, int]]:
        """
        Read frames with dynamic window forward-scan recovery.

        When a target frame fails to load, the next successfully grabbed
        frame (before the next target frame) will be used to recover it.
        A failed target whose window has passed without a usable frame is
        dropped, so it cannot block the recovery of later failed targets.

        Args:
            cap: OpenCV VideoCapture object
            frame_indices: Sorted list of target frame indices to load
            total_frames: Total number of frames in the video

        Returns:
            Tuple of (frames_array, valid_frame_indices, recovered_map)
            - frames_array: Array of loaded frames (RGB); empty with shape
              ``(0, height, width, 3)`` when nothing could be loaded
            - valid_frame_indices: List of frame indices that were loaded
            - recovered_map: Dict mapping recovered_idx -> source_idx

        Raises:
            AssertionError: If the capture reports a non-positive frame size.
        """
        import cv2

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        assert width > 0 and height > 0, (
            f"Invalid video frame size: width={width}, height={height}"
        )

        if not frame_indices:
            # Nothing requested: do not touch the stream at all.
            return np.empty((0, height, width, 3), dtype=np.uint8), [], {}

        frame_idx_set = set(frame_indices)
        max_frame_idx = frame_indices[-1]

        # Build map: target_idx -> next_target_idx (for recovery window)
        next_target_map = VideoLoader._build_next_target_map(
            frame_indices, total_frames
        )

        frames_list: list[npt.NDArray] = []
        valid_frame_indices: list[int] = []
        failed_frames_idx: list[int] = []
        recovered_map: dict[int, int] = {}

        for idx in range(max_frame_idx + 1):
            is_target_frame = idx in frame_idx_set

            # Attempt to grab the current frame
            ok = cap.grab()
            if not ok:
                if is_target_frame:
                    logger.warning(
                        "Failed to grab frame %d during video loading.",
                        idx,
                    )
                    failed_frames_idx.append(idx)
                continue

            # Forget failures that can no longer be recovered; otherwise a
            # stale entry at the head of the queue would block all later ones.
            for expired_idx in VideoLoader._pop_expired_failures(
                idx, failed_frames_idx, next_target_map, total_frames
            ):
                logger.warning(
                    "Frame %d could not be recovered (recovery window passed).",
                    expired_idx,
                )

            # Check if we should retrieve: target frame OR can recover a failed one
            can_recover = VideoLoader._can_use_for_recovery(
                idx, failed_frames_idx, next_target_map, total_frames
            )
            if not (is_target_frame or can_recover):
                continue

            ret, frame = cap.retrieve()
            if ret and frame is not None and frame.size > 0:
                # OpenCV decodes to BGR; callers receive RGB.
                frames_list.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                valid_frame_indices.append(idx)
                if can_recover:
                    recovered_idx = failed_frames_idx.pop(0)
                    recovered_map[recovered_idx] = idx
                    logger.info(
                        "Recovered frame %d using frame %d (delay: %d)",
                        recovered_idx,
                        idx,
                        idx - recovered_idx,
                    )
            elif is_target_frame:
                logger.warning(
                    "Failed to retrieve frame %d during video loading.",
                    idx,
                )
                failed_frames_idx.append(idx)

        # Log any remaining failed frames
        for failed_idx in failed_frames_idx:
            logger.warning(
                "Frame %d could not be recovered (end of video).",
                failed_idx,
            )

        # Stack frames
        if frames_list:
            frames = np.stack(frames_list)
        else:
            frames = np.empty((0, height, width, 3), dtype=np.uint8)
        return frames, valid_frame_indices, recovered_map

    @staticmethod
    def _read_frames(
        cap,
        frame_indices: set[int],
        num_expected_frames: int,
        max_frame_idx: int,
    ) -> tuple[npt.NDArray, int, list[int]]:
        """Read the requested frames without recovery; broken frames are skipped.

        Args:
            cap: OpenCV VideoCapture object.
            frame_indices: Set of frame indices to decode.
            num_expected_frames: Number of slots to pre-allocate in the
                output buffer.
            max_frame_idx: Last stream position to scan (inclusive).

        Returns:
            Tuple of (frames_array, valid_num_frames, valid_frame_indices)
            where ``frames_array`` holds only the frames that were decoded
            successfully (RGB, ``uint8``).
        """
        import cv2

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frames = np.empty((num_expected_frames, height, width, 3), dtype=np.uint8)

        i = 0  # next free slot in ``frames``
        valid_frame_indices: list[int] = []
        for idx in range(max_frame_idx + 1):
            ok = cap.grab()
            if not ok:
                # Frame is broken/unreadable, log warning
                if idx in frame_indices:
                    logger.warning(
                        "Failed to grab frame %d during video loading. "
                        "This frame will be skipped.",
                        idx,
                    )
                continue
            if idx in frame_indices:
                ret, frame = cap.retrieve()
                if ret and frame is not None:
                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    valid_frame_indices.append(idx)
                    i += 1
                else:
                    # retrieve() failed even though grab() succeeded
                    logger.warning(
                        "Failed to retrieve frame %d during video loading. "
                        "This frame will be skipped.",
                        idx,
                    )

        valid_num_frames = len(valid_frame_indices)
        if valid_num_frames < num_expected_frames:
            logger.warning(
                "Video loading completed with %d broken/unreadable frames. "
                "Expected %d frames but only loaded %d frames.",
                num_expected_frames - valid_num_frames,
                num_expected_frames,
                valid_num_frames,
            )
        # Trim the unused tail of the pre-allocated buffer.
        return frames[:valid_num_frames], valid_num_frames, valid_frame_indices
# Registry of video loader backends. Backend classes in this module add
# themselves under a string key via ``@VIDEO_LOADER_REGISTRY.register(...)``.
VIDEO_LOADER_REGISTRY = ExtensionManager()
@VIDEO_LOADER_REGISTRY.register("opencv")
class OpenCVVideoBackend(VideoLoader):
    """Video loader that decodes in-memory video bytes with OpenCV."""

    def get_cv2_video_api(self):
        """Return the first usable OpenCV stream-buffered backend.

        A backend is skipped when OpenCV does not have it, or when it is a
        plugin (not built in) whose reported ABI/API version is older than
        1.2. Returns ``None`` if no backend qualifies.
        """
        import cv2.videoio_registry as vr

        api_pref = None
        for backend in vr.getStreamBufferedBackends():
            if not vr.hasBackend(backend):
                continue
            if not vr.isBackendBuiltIn(backend):
                _, abi, api = vr.getStreamBufferedBackendPluginVersion(backend)
                if abi < 1 or (abi == 1 and api < 2):
                    continue
            api_pref = backend
            break
        return api_pref

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        fps: int = -1,
        max_duration: int = 300,
        frame_recovery: bool = False,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Load video frames from bytes.

        Args:
            data: Raw video bytes
            num_frames: Target number of frames to sample (-1 for all)
            fps: Target FPS for sampling (-1 for original)
            max_duration: Maximum duration (unused in base backend)
            frame_recovery: Enable forward-scan recovery for failed frames

        Returns:
            Tuple of (frames_array, metadata_dict)

        Raises:
            ValueError: If OpenCV cannot open the video stream.
        """
        import cv2

        backend = cls().get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        try:
            if not cap.isOpened():
                raise ValueError("Could not open video stream")

            total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            original_fps = cap.get(cv2.CAP_PROP_FPS)
            duration = total_frames_num / original_fps if original_fps > 0 else 0

            # resample video to target num_frames and fps
            # - the minimum of the two will be used
            num_frames_to_sample = total_frames_num
            if num_frames > 0:
                num_frames_to_sample = min(num_frames, total_frames_num)
            if fps > 0:
                num_frames_to_sample = min(
                    num_frames_to_sample, math.floor(duration * fps)
                )
            num_frames_to_sample = max(1, num_frames_to_sample)  # at least one sample

            if num_frames_to_sample == total_frames_num:
                frame_idx = list(range(0, num_frames_to_sample))
            else:
                uniform_sampled_frames = np.linspace(
                    0, total_frames_num - 1, num_frames_to_sample, dtype=int
                )
                frame_idx = uniform_sampled_frames.tolist()

            if frame_recovery:
                frames, valid_frame_indices, recovered_map = (
                    cls._read_frames_with_recovery(cap, frame_idx, total_frames_num)
                )
                valid_num_frames = len(valid_frame_indices)
                if recovered_map:
                    logger.info(
                        "Frame recovery: %d frames recovered using forward scan.",
                        len(recovered_map),
                    )
            else:
                frame_idx_set = set(frame_idx)
                frames, valid_num_frames, valid_frame_indices = cls._read_frames(
                    cap, frame_idx_set, num_frames_to_sample, max(frame_idx)
                )
        finally:
            # Free the decoder explicitly, also when opening or reading fails.
            cap.release()

        # Use transformers transformers.video_utils.VideoMetadata format
        # NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
        # can cause incorrect timestamp calculation without num_frames=-1.
        metadata = {
            "total_num_frames": total_frames_num,
            "fps": original_fps,
            "duration": duration,
            "video_backend": "opencv",
            "frames_indices": valid_frame_indices,
            # extra field used to control hf processor's video
            # sampling behavior
            "do_sample_frames": valid_num_frames == total_frames_num,
        }
        return frames, metadata
@VIDEO_LOADER_REGISTRY.register("opencv_dynamic")
class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
    """OpenCV loader whose frame sampling depends on the video duration."""

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        fps: int = 2,
        max_duration: int = 300,
        frame_recovery: bool = False,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Load video frames with dynamic sampling based on duration.

        Videos no longer than ``max_duration`` are sampled at ``fps``; longer
        videos are sampled at ``max_duration * fps`` timestamps spread evenly
        over the whole duration.

        Args:
            data: Raw video bytes
            num_frames: Not used in dynamic backend
            fps: Target FPS for sampling (default: 2)
            max_duration: Maximum video duration to process (default: 300s)
            frame_recovery: Enable forward-scan recovery for failed frames

        Returns:
            Tuple of (frames_array, metadata_dict)

        Raises:
            ValueError: If the stream cannot be opened, or if the container
                reports a non-positive FPS (timestamps cannot be mapped to
                frame indices in that case).
        """
        import cv2

        backend = cls().get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        try:
            if not cap.isOpened():
                raise ValueError("Could not open video stream")

            total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            original_fps = cap.get(cv2.CAP_PROP_FPS)
            if original_fps <= 0:
                # Every formula below divides by or scales with the FPS.
                raise ValueError(
                    "Could not determine a positive FPS for the video "
                    f"(got {original_fps})"
                )

            max_frame_idx = total_frames_num - 1
            duration = total_frames_num / original_fps
            # A reported frame count of 0 gives a zero duration; fall back to
            # an estimate derived from the last frame index.
            duration = duration or round(max_frame_idx / original_fps) + 1

            # Refer to:
            # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
            frame_indices_list: list[int]
            if duration <= max_duration:
                n = int(math.floor(duration * fps))
                # The set drops duplicates produced by clamping to the last frame.
                frame_indices_list = sorted(
                    {
                        min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
                        for i in range(n)
                    }
                )
            else:
                num_samples = int(max_duration * fps)
                if num_samples >= total_frames_num:
                    frame_indices_list = list(range(total_frames_num))
                else:
                    target_seconds = np.linspace(
                        0, duration, num_samples, endpoint=True
                    )
                    frame_indices_list = sorted(
                        {
                            min(max_frame_idx, int(math.ceil(t * original_fps)))
                            for t in target_seconds
                        }
                    )

            if frame_recovery:
                frames, valid_frame_indices, recovered_map = (
                    cls._read_frames_with_recovery(
                        cap, frame_indices_list, total_frames_num
                    )
                )
                valid_num_frames = len(valid_frame_indices)
                if recovered_map:
                    logger.info(
                        "Frame recovery: %d frames recovered using forward scan.",
                        len(recovered_map),
                    )
            else:
                frame_indices_set = set(frame_indices_list)
                frames, valid_num_frames, valid_frame_indices = cls._read_frames(
                    cap,
                    frame_indices_set,
                    len(frame_indices_list),
                    total_frames_num - 1,
                )
        finally:
            # Free the decoder explicitly, also when opening or reading fails.
            cap.release()

        # Use transformers transformers.video_utils.VideoMetadata format
        metadata = {
            "total_num_frames": total_frames_num,
            "fps": original_fps,
            "duration": duration,
            "video_backend": "opencv_dynamic",
            "frames_indices": valid_frame_indices,
            "do_sample_frames": False,
        }
        return frames, metadata
@VIDEO_LOADER_REGISTRY.register("molmo2")
class Molmo2VideoBackend(VideoLoader):
    """OpenCV-based loader with the ``fps`` / ``uniform_last_frame`` sampling modes."""

    def get_cv2_video_api(self):
        """Return the first usable OpenCV stream-buffered backend.

        A backend is skipped when OpenCV does not have it, or when it is a
        plugin (not built in) whose reported ABI/API version is older than
        1.2. Returns ``None`` if no backend qualifies.
        """
        import cv2.videoio_registry as vr

        api_pref = None
        for backend in vr.getStreamBufferedBackends():
            if not vr.hasBackend(backend):
                continue
            if not vr.isBackendBuiltIn(backend):
                _, abi, api = vr.getStreamBufferedBackendPluginVersion(backend)
                if abi < 1 or (abi == 1 and api < 2):
                    continue
            api_pref = backend
            break
        return api_pref

    @classmethod
    def get_candidate_target_fps(
        cls,
        video_fps: float,
        sampling_fps: float,
        max_fps: float = 8.0,
    ) -> list[float]:
        """
        Return the subset of `video_fps` factors that remain multiples
        of `sampling_fps`.

        All three arguments are truncated to integers first. Candidates are
        returned in increasing order and never exceed `max_fps`.

        Examples:
            >>> Molmo2VideoBackend.get_candidate_target_fps(video_fps=6, sampling_fps=2)
            [2.0, 6.0]
            >>> Molmo2VideoBackend.get_candidate_target_fps(video_fps=5, sampling_fps=1)
            [1.0, 5.0]
            >>> Molmo2VideoBackend.get_candidate_target_fps(video_fps=2, sampling_fps=2)
            [2.0]
            >>> Molmo2VideoBackend.get_candidate_target_fps(video_fps=5, sampling_fps=2)
            Traceback (most recent call last):
                ...
            ValueError: sampling_fps=2 must divide video_fps=5.

        Raises:
            ValueError: If `sampling_fps` is None, if either rate is not
                positive after truncation, or if `sampling_fps` does not
                divide `video_fps`.
        """
        # Validate before converting: int(None) would raise TypeError and
        # make this check unreachable.
        if sampling_fps is None:
            raise ValueError("sampling_fps must be provided")

        video_fps = int(video_fps)
        sampling_fps = int(sampling_fps)
        max_fps = int(max_fps)

        if video_fps <= 0 or sampling_fps <= 0:
            raise ValueError(
                "video_fps and sampling_fps must be positive "
                f"(got {video_fps}, {sampling_fps})"
            )
        if video_fps % sampling_fps != 0:
            raise ValueError(
                f"sampling_fps={sampling_fps} must divide video_fps={video_fps}."
            )

        candidates = []
        for candidate in range(sampling_fps, video_fps + 1, sampling_fps):
            if candidate > max_fps:
                break
            if video_fps % candidate == 0:
                candidates.append(float(candidate))
        return candidates

    @classmethod
    def get_target_fps(
        cls,
        video_fps: float,
        max_frames: int,
        total_frames: int,
        frame_sample_mode: str,
        candidate_target_fps: list[float],
    ) -> float | None:
        """
        Get the target fps that best spans the video and has the most frames sampled

        Candidates are visited in the given order (expected to be
        increasing). Returns ``None`` when no candidate was selected, e.g.
        when the list is empty or, for modes containing ``"uniform"``, when
        already the first usable candidate would exceed `max_frames`.
        """
        num_frames_sampled = 0
        selected_target_fps = None
        for target_fps in candidate_target_fps:
            step_size = max(int(video_fps / target_fps), 1)
            num_frames_sampled_at_fps = int(total_frames / step_size)
            if num_frames_sampled == 0:
                if (
                    "uniform" in frame_sample_mode
                    and num_frames_sampled_at_fps > max_frames
                ):
                    break
                selected_target_fps = target_fps
                num_frames_sampled = num_frames_sampled_at_fps
            else:
                # the candidate sampling fps increases so frame count can't decrease
                assert num_frames_sampled <= num_frames_sampled_at_fps
                if num_frames_sampled_at_fps > max_frames:
                    # choose the sampling fps that spans the video
                    continue
                elif num_frames_sampled_at_fps > num_frames_sampled:
                    # both are less than max_frames; choose the one with higher
                    # density of frames sampled
                    selected_target_fps = target_fps
                    num_frames_sampled = num_frames_sampled_at_fps
        return selected_target_fps

    @classmethod
    def get_frame_times_and_chosen_fps(
        cls,
        selected_target_fps: float | None,
        total_frames: int,
        max_frames: int,
        video_fps: float,
    ) -> tuple[float | None, npt.NDArray]:
        """Turn a chosen target fps into concrete frame indices.

        Without a target fps, `max_frames` indices are spread evenly over
        ``[0, total_frames)``. With one, every ``step``-th frame is taken
        (``step = max(int(video_fps / selected_target_fps), 1)``) and the
        result is cut to the first `max_frames` entries.

        Returns:
            ``(selected_target_fps, frame_indices)``; the fps is passed
            through unchanged.
        """
        if selected_target_fps is None:
            frame_indices = np.linspace(
                0, total_frames, max_frames, endpoint=False, dtype=int
            )
        else:
            step_size = max(int(video_fps / selected_target_fps), 1)
            frame_indices = np.arange(0, total_frames, step_size)
        if len(frame_indices) > max_frames:
            frame_indices = frame_indices[:max_frames]
        return selected_target_fps, frame_indices

    @classmethod
    def sample_times(
        cls,
        duration: float,
        max_frames: int,
        frame_sample_mode: str,
        max_fps: int | None,
        candidate_target_fps: list[float] | None = None,
        **kwargs,
    ) -> npt.NDArray:
        """Compute the timestamps (in seconds) at which to sample a video.

        Args:
            duration: Video duration in seconds.
            max_frames: Upper bound on the number of timestamps.
            frame_sample_mode: ``"fps"`` or ``"uniform_last_frame"``.
            max_fps: Sampling rate cap for ``"uniform_last_frame"``; ``None``
                disables the cap and always samples uniformly.
            candidate_target_fps: Candidate rates, required for ``"fps"``.
            **kwargs: Accepted and ignored.

        Raises:
            NotImplementedError: For any other `frame_sample_mode`.
        """
        if frame_sample_mode == "fps":
            assert candidate_target_fps is not None
            # Try larger and larger FPSs until we hit one that can't span the video
            sampling_fps = candidate_target_fps[0]
            for candidate_fps in candidate_target_fps[1:]:
                if max_frames / candidate_fps < duration:
                    break
                sampling_fps = candidate_fps
            times = np.arange(0, max_frames) / sampling_fps
            times = times[times < duration]
            return times
        elif frame_sample_mode == "uniform_last_frame":
            if max_fps is not None:
                max_duration = (
                    max_frames - 1
                ) / max_fps  # -1 to include the last frame
                if max_duration < duration:
                    times = np.linspace(
                        0, duration, num=max_frames, endpoint=True, dtype=np.float64
                    )
                else:
                    times = np.arange(0.0, stop=duration, step=1 / max_fps)
                    # arange excludes the stop value; add the last timestamp.
                    times = np.concatenate([times, [duration]], axis=0)
                    assert len(times) <= max_frames
            else:
                times = np.linspace(
                    0, duration, num=max_frames, endpoint=True, dtype=np.float64
                )
            return times
        else:
            raise NotImplementedError(frame_sample_mode)

    @classmethod
    def _sample_frames(
        cls,
        total_num_frames: int,
        video_fps: float,
        duration: float,
        frame_sample_mode: str,
        num_frames: int,
        max_fps: int,
        sampling_fps: int,
    ) -> npt.NDArray:
        """Choose the frame indices to decode for the given sampling mode.

        Args:
            total_num_frames: Number of frames in the video.
            video_fps: Native frame rate of the video.
            duration: Video duration in seconds.
            frame_sample_mode: ``"uniform_last_frame"`` or ``"fps"``.
            num_frames: Maximum number of frames to sample.
            max_fps: Rate cap for ``"uniform_last_frame"`` (``None`` means
                plain uniform sampling).
            sampling_fps: Base rate for ``"fps"`` mode.

        Returns:
            Integer array of frame indices.

        Raises:
            NotImplementedError: For any other `frame_sample_mode`.
        """
        if frame_sample_mode == "uniform_last_frame" and max_fps is not None:
            if total_num_frames <= 2:
                indices = np.arange(total_num_frames).astype(int)
            elif duration > (num_frames - 1) / max_fps:  # -1 to include the last frame
                # uniform fallback
                indices = np.linspace(
                    0,
                    total_num_frames - 1,
                    num=min(num_frames, total_num_frames),
                    endpoint=True,
                ).astype(int)
            else:
                float_indices = np.arange(
                    0.0,
                    stop=total_num_frames - 1,
                    step=float(video_fps / max_fps),
                )
                # Make sure the very last frame is part of the selection.
                if np.round(float_indices[-1]) != total_num_frames - 1:
                    float_indices = np.concatenate(
                        [float_indices, [total_num_frames - 1]], axis=0
                    )
                indices = np.round(float_indices).astype(int)
                assert indices[-1] < total_num_frames
                assert len(float_indices) <= num_frames
        elif frame_sample_mode == "uniform_last_frame":
            indices = np.linspace(
                0,
                total_num_frames - 1,
                num=min(num_frames, total_num_frames),
                endpoint=True,
            ).astype(int)
        elif frame_sample_mode == "fps":
            candidate_target_fps = cls.get_candidate_target_fps(video_fps, sampling_fps)
            selected_target_fps = cls.get_target_fps(
                video_fps,
                num_frames,
                total_num_frames,
                frame_sample_mode,
                candidate_target_fps,
            )
            _, indices = cls.get_frame_times_and_chosen_fps(
                selected_target_fps,
                total_num_frames,
                num_frames,
                video_fps,
            )
        else:
            raise NotImplementedError(frame_sample_mode)
        return indices

    @classmethod
    def load_bytes_opencv(
        cls,
        data: bytes,
        frame_sample_mode: str | None = None,
        num_frames: int = -1,
        max_fps: int = 2,
        sampling_fps: int = 2,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode video bytes with OpenCV.

        With ``frame_sample_mode=None`` every frame is read and
        ``"frames_indices"`` is only added to the metadata when some frames
        could not be read. Otherwise the frames chosen by ``_sample_frames``
        are read and the metadata always carries ``"frames_indices"`` with
        ``"do_sample_frames"`` set to ``False``.

        Returns:
            Tuple of (frames_array, metadata_dict)

        Raises:
            ValueError: If OpenCV cannot open the video stream.
        """
        import cv2

        backend = cls().get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        try:
            if not cap.isOpened():
                raise ValueError("Could not open video stream")

            total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            original_fps = cap.get(cv2.CAP_PROP_FPS)
            duration = total_frames_num / original_fps if original_fps > 0 else 0

            if frame_sample_mode is None:
                # Use transformers transformers.video_utils.VideoMetadata format
                frame_idx = list(range(0, total_frames_num))
                frame_idx_set = set(frame_idx)
                frames, valid_num_frames, valid_frame_indices = cls._read_frames(
                    cap, frame_idx_set, total_frames_num, max(frame_idx)
                )
                do_sample_frames = valid_num_frames == total_frames_num
                metadata = {
                    "total_num_frames": total_frames_num,
                    "fps": original_fps,
                    "duration": duration,
                    "video_backend": "opencv",
                    "do_sample_frames": do_sample_frames,
                }
                if not do_sample_frames:
                    metadata["frames_indices"] = valid_frame_indices
                return frames, metadata

            frame_idx = cls._sample_frames(
                total_frames_num,
                original_fps,
                duration,
                frame_sample_mode,
                num_frames,
                max_fps,
                sampling_fps,
            ).tolist()
            frames, valid_num_frames, valid_frame_indices = cls._read_frames(
                cap,
                set(frame_idx),
                len(frame_idx),
                total_frames_num - 1,
            )
        finally:
            # Free the decoder explicitly on every exit path.
            cap.release()

        metadata = {
            "total_num_frames": total_frames_num,
            "fps": original_fps,
            "duration": duration,
            "video_backend": "opencv",
            "frames_indices": valid_frame_indices,
            "do_sample_frames": False,
        }
        return frames, metadata

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Load video frames from bytes.

        ``frame_sample_mode`` (default ``None``), ``max_fps`` (default 2) and
        ``sampling_fps`` (default 2) are taken out of ``kwargs``; everything
        else is forwarded to ``load_bytes_opencv``.
        """
        frame_sample_mode = cast(str | None, kwargs.pop("frame_sample_mode", None))
        max_fps = cast(int, kwargs.pop("max_fps", 2))
        sampling_fps = cast(int, kwargs.pop("sampling_fps", 2))
        out = cls.load_bytes_opencv(
            data,
            frame_sample_mode,
            num_frames,
            max_fps,
            sampling_fps,
            **kwargs,
        )
        return out
@VIDEO_LOADER_REGISTRY.register("openpangu")
class OpenCVDynamicOpenPanguVideoBackend(OpenCVVideoBackend):
    """OpenCV loader that samples frames at evenly spaced timestamps."""

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = 32,
        fps: int = 1,
        max_duration: int = 300,
        frame_recovery: bool = False,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Load video frames with dynamic sampling based on duration.

        Assume that total_num_frames = 10 and fps = 1.
        The timestamp of frame 0 is 0.0.
        The timestamp of frame 1 is 1.0.
        ...
        The timestamp of frame 9 (the last frame) should be 9.0, that is,
        (total_frames_num - 1) / original_fps.

        Args:
            data: Raw video bytes
            num_frames: Maximum number of frames to sample (default: 32).
                With ``fps > 0`` it is lowered to the number of frames that
                sampling at ``fps`` yields, if that is smaller.
            fps: Target FPS for sampling (default: 1); -1 samples
                ``num_frames`` timestamps without an FPS limit
            max_duration: Not used by this backend
            frame_recovery: Not consulted; this backend always reads frames
                with forward-scan recovery

        Returns:
            Tuple of (frames_array, metadata_dict)

        Raises:
            ValueError: If the stream cannot be opened, or if ``fps`` is
                neither -1 nor greater than 0.
        """
        import cv2

        backend = cls().get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        try:
            if not cap.isOpened():
                raise ValueError("Could not open video stream")

            total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            original_fps = float(cap.get(cv2.CAP_PROP_FPS))

            # Timestamp of the last frame (frame 0 is at 0.0), hence the
            # ``- 1``; it is not the frame count divided by the FPS.
            if total_frames_num >= 1 and original_fps > 0:
                total_duration = (total_frames_num - 1) / original_fps
            else:
                total_duration = 0

            # `fps` is the FPS parameter passed in for sampling,
            # -1 indicates that sampling can be performed directly without
            # FPS limitation.
            if fps > 0:
                # `num_frames` is the maximum number of frames to sample. If
                # sampling at `fps` yields fewer frames, use that count.
                frames_at_fps = int(total_duration * fps) + 1
                if num_frames >= frames_at_fps:
                    num_frames = frames_at_fps
                    # With the reduced frame count, the last sampled
                    # timestamp is (num_frames - 1) / fps.
                    total_duration = min(total_duration, (num_frames - 1) / fps)
            elif fps != -1:
                raise ValueError(
                    f"requires dataset fps is -1 or greater than 0 but got {fps}"
                )

            sample_frame_timestamps = np.linspace(
                0, total_duration, num_frames, dtype=float
            )
            # Map timestamps to frame indices, clamped to the last frame.
            frames_indices = [
                min(total_frames_num - 1, round(t * original_fps))
                for t in sample_frame_timestamps
            ]

            frames, valid_frame_indices, recovered_map = (
                cls._read_frames_with_recovery(cap, frames_indices, total_frames_num)
            )
        finally:
            # Free the decoder explicitly, also when opening or reading fails.
            cap.release()

        if recovered_map:
            logger.info(
                "Frame recovery: %d frames recovered using forward scan.",
                len(recovered_map),
            )

        metadata = {
            "total_num_frames": total_frames_num,
            "fps": original_fps,
            "duration": total_duration,
            "video_backend": "opencv_dynamic_openpangu",
            "frames_indices": valid_frame_indices,
            "do_sample_frames": False,
        }
        return frames, metadata