v1.0

2026-03-05 18:06:10 +08:00
commit 809cecae09
2569 changed files with 478204 additions and 0 deletions
--- a/assets/init.py
+++ b/assets/init.py
--- a/assets/pycache/init.cpython-312.pyc
+++ b/assets/pycache/init.cpython-312.pyc
--- a/assets/pycache/audio.cpython-312.pyc
+++ b/assets/pycache/audio.cpython-312.pyc
--- a/assets/pycache/base.cpython-312.pyc
+++ b/assets/pycache/base.cpython-312.pyc
--- a/assets/pycache/image.cpython-312.pyc
+++ b/assets/pycache/image.cpython-312.pyc
--- a/assets/pycache/video.cpython-312.pyc
+++ b/assets/pycache/video.cpython-312.pyc
--- a/assets/audio.py
+++ b/assets/audio.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Literal
+from urllib.parse import urljoin
+
+import numpy.typing as npt
+
+from vllm.utils.import_utils import PlaceholderModule
+
+from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
+
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+
+ASSET_DIR = "multimodal_asset"
+
+AudioAssetName = Literal["winning_call", "mary_had_lamb"]
+
+
+@dataclass(frozen=True)
+class AudioAsset:
+    name: AudioAssetName
+
+    @property
+    def filename(self) -> str:
+        return f"{self.name}.ogg"
+
+    @property
+    def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
+        audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
+        return librosa.load(audio_path, sr=None)
+
+    def get_local_path(self) -> Path:
+        return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
+
+    @property
+    def url(self) -> str:
+        return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
--- a/assets/base.py
+++ b/assets/base.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from functools import lru_cache
+from pathlib import Path
+
+import vllm.envs as envs
+from vllm.connections import global_http_connection
+
+VLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
+
+
+def get_cache_dir() -> Path:
+    """Get the path to the cache for storing downloaded assets."""
+    path = Path(envs.VLLM_ASSETS_CACHE)
+    path.mkdir(parents=True, exist_ok=True)
+
+    return path
+
+
+@lru_cache
+def get_vllm_public_assets(filename: str, s3_prefix: str | None = None) -> Path:
+    """
+    Download an asset file from `s3://vllm-public-assets`
+    and return the path to the downloaded file.
+    """
+    asset_directory = get_cache_dir() / "vllm_public_assets"
+    asset_directory.mkdir(parents=True, exist_ok=True)
+
+    asset_path = asset_directory / filename
+    if not asset_path.exists():
+        if s3_prefix is not None:
+            filename = s3_prefix + "/" + filename
+        global_http_connection.download_file(
+            f"{VLLM_S3_BUCKET_URL}/{filename}",
+            asset_path,
+            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
+        )
+
+    return asset_path
--- a/assets/image.py
+++ b/assets/image.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Literal
+
+import torch
+from PIL import Image
+
+from .base import get_vllm_public_assets
+
+VLM_IMAGES_DIR = "vision_model_images"
+
+ImageAssetName = Literal[
+    "stop_sign",
+    "cherry_blossom",
+    "hato",
+    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk",
+    "Grayscale_8bits_palette_sample_image",
+    "1280px-Venn_diagram_rgb",
+    "RGBA_comp",
+    "237-400x300",
+    "231-200x300",
+    "27-500x500",
+    "17-150x600",
+    "handelsblatt-preview",
+    "paper-11",
+]
+
+
+@dataclass(frozen=True)
+class ImageAsset:
+    name: ImageAssetName
+
+    def get_path(self, ext: str) -> Path:
+        """
+        Return s3 path for given image.
+        """
+        return get_vllm_public_assets(
+            filename=f"{self.name}.{ext}", s3_prefix=VLM_IMAGES_DIR
+        )
+
+    @property
+    def pil_image(self, ext="jpg") -> Image.Image:
+        image_path = self.get_path(ext)
+        return Image.open(image_path)
+
+    @property
+    def image_embeds(self) -> torch.Tensor:
+        """
+        Image embeddings, only used for testing purposes with llava 1.5.
+        """
+        image_path = self.get_path("pt")
+        return torch.load(image_path, map_location="cpu", weights_only=True)
+
+    def read_bytes(self, ext: str) -> bytes:
+        p = Path(self.get_path(ext))
+        return p.read_bytes()
--- a/assets/video.py
+++ b/assets/video.py
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Any, ClassVar, Literal
+
+import numpy as np
+import numpy.typing as npt
+from huggingface_hub import hf_hub_download
+from PIL import Image
+
+from vllm.utils.import_utils import PlaceholderModule
+
+from .base import get_cache_dir
+
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+
+
+@lru_cache
+def download_video_asset(filename: str) -> str:
+    """
+    Download and open an image from huggingface
+    repo: raushan-testing-hf/videos-test
+    """
+    video_directory = get_cache_dir() / "video-example-data"
+    video_directory.mkdir(parents=True, exist_ok=True)
+
+    video_path = video_directory / filename
+    video_path_str = str(video_path)
+    if not video_path.exists():
+        video_path_str = hf_hub_download(
+            repo_id="raushan-testing-hf/videos-test",
+            filename=filename,
+            repo_type="dataset",
+            cache_dir=video_directory,
+        )
+    return video_path_str
+
+
+def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
+    import cv2
+
+    cap = cv2.VideoCapture(path)
+    if not cap.isOpened():
+        raise ValueError(f"Could not open video file {path}")
+
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    frames = []
+
+    num_frames = num_frames if num_frames > 0 else total_frames
+    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+    for idx in range(total_frames):
+        ok = cap.grab()  # next img
+        if not ok:
+            break
+        if idx in frame_indices:  # only decompress needed
+            ret, frame = cap.retrieve()
+            if ret:
+                # OpenCV uses BGR format, we need to convert it to RGB
+                # for PIL and transformers compatibility
+                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+    frames = np.stack(frames)
+    if len(frames) < num_frames:
+        raise ValueError(
+            f"Could not read enough frames from video file {path}"
+            f" (expected {num_frames} frames, got {len(frames)})"
+        )
+    return frames
+
+
+def video_to_pil_images_list(path: str, num_frames: int = -1) -> list[Image.Image]:
+    frames = video_to_ndarrays(path, num_frames)
+    return [Image.fromarray(frame) for frame in frames]
+
+
+def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]:
+    import cv2
+
+    cap = cv2.VideoCapture(path)
+    if not cap.isOpened():
+        raise ValueError(f"Could not open video file {path}")
+
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    duration = total_frames / fps if fps > 0 else 0
+
+    if num_frames == -1 or num_frames > total_frames:
+        num_frames = total_frames
+
+    metadata = {
+        "total_num_frames": num_frames,
+        "fps": duration / num_frames,
+        "duration": duration,
+        "video_backend": "opencv",
+        "frames_indices": list(range(num_frames)),
+        # extra field used to control hf processor's video
+        # sampling behavior
+        "do_sample_frames": num_frames == total_frames,
+    }
+    return metadata
+
+
+VideoAssetName = Literal["baby_reading"]
+
+
+@dataclass(frozen=True)
+class VideoAsset:
+    name: VideoAssetName
+    num_frames: int = -1
+
+    _NAME_TO_FILE: ClassVar[dict[VideoAssetName, str]] = {
+        "baby_reading": "sample_demo_1.mp4",
+    }
+
+    @property
+    def filename(self) -> str:
+        return self._NAME_TO_FILE[self.name]
+
+    @property
+    def video_path(self) -> str:
+        return download_video_asset(self.filename)
+
+    @property
+    def pil_images(self) -> list[Image.Image]:
+        ret = video_to_pil_images_list(self.video_path, self.num_frames)
+        return ret
+
+    @property
+    def np_ndarrays(self) -> npt.NDArray:
+        ret = video_to_ndarrays(self.video_path, self.num_frames)
+        return ret
+
+    @property
+    def metadata(self) -> dict[str, Any]:
+        ret = video_get_metadata(self.video_path, self.num_frames)
+        return ret
+
+    def get_audio(self, sampling_rate: float | None = None) -> npt.NDArray:
+        """
+        Read audio data from the video asset, used in Qwen2.5-Omni examples.
+
+        See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
+        """
+        return librosa.load(self.video_path, sr=sampling_rate)[0]