Add minimal vLLM 0.16.1 build repo for BI-V150
This commit is contained in:
20
vllm/multimodal/media/__init__.py
Normal file
20
vllm/multimodal/media/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from .audio import AudioEmbeddingMediaIO, AudioMediaIO
|
||||
from .base import MediaIO, MediaWithBytes
|
||||
from .connector import MEDIA_CONNECTOR_REGISTRY, MediaConnector
|
||||
from .image import ImageEmbeddingMediaIO, ImageMediaIO
|
||||
from .video import VIDEO_LOADER_REGISTRY, VideoMediaIO
|
||||
|
||||
# Public API of vllm.multimodal.media.
__all__ = [
    "MediaIO",
    "MediaWithBytes",
    "AudioEmbeddingMediaIO",
    "AudioMediaIO",
    "ImageEmbeddingMediaIO",
    "ImageMediaIO",
    "VIDEO_LOADER_REGISTRY",
    "VideoMediaIO",
    "MEDIA_CONNECTOR_REGISTRY",
    "MediaConnector",
]
89
vllm/multimodal/media/audio.py
Normal file
89
vllm/multimodal/media/audio.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import base64
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import numpy.typing as npt
|
||||
import pybase64
|
||||
import torch
|
||||
|
||||
from vllm.utils.import_utils import PlaceholderModule
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
from .base import MediaIO
|
||||
|
||||
try:
|
||||
import librosa
|
||||
except ImportError:
|
||||
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import soundfile
|
||||
except ImportError:
|
||||
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
|
||||
|
||||
|
||||
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
    """Audio loader/encoder.

    Decoded media is a ``(samples, sample_rate)`` tuple as returned by
    ``librosa.load`` with ``sr=None`` (native sampling rate preserved).
    """

    def __init__(self, **kwargs) -> None:
        super().__init__()

        # `kwargs` carries custom arguments from --media-io-kwargs for the
        # audio modality. They can be forwarded to the underlying media
        # loaders (e.g. custom implementations) for flexible control.
        self.kwargs = kwargs

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
        """Decode an in-memory audio payload at its native sampling rate."""
        return librosa.load(BytesIO(data), sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
        """Decode a base64 audio payload; `media_type` is accepted but unused."""
        return self.load_bytes(base64.b64decode(data))

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """Decode an audio file from disk at its native sampling rate."""
        return librosa.load(filepath, sr=None)

    def encode_base64(
        self,
        media: tuple[npt.NDArray, int],
        *,
        audio_format: str = "WAV",
    ) -> str:
        """Serialize a ``(samples, sample_rate)`` pair to a base64 string."""
        samples, sample_rate = media

        with BytesIO() as buffer:
            soundfile.write(buffer, samples, sample_rate, format=audio_format)
            payload = buffer.getvalue()

        return base64.b64encode(payload).decode("utf-8")
class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Loader for precomputed audio embeddings serialized with ``torch.save``."""

    def __init__(self) -> None:
        super().__init__()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        """Deserialize a tensor from raw bytes (safe, weights-only load)."""
        # Sparse tensor invariant checking guards against out-of-bounds
        # writes from maliciously crafted tensors.
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(BytesIO(data), weights_only=True)
        return tensor.to_dense()

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        """Deserialize a tensor from strict base64; `media_type` is unused."""
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> torch.Tensor:
        """Deserialize a tensor from a file on disk (safe, weights-only load)."""
        # Sparse tensor invariant checking guards against out-of-bounds
        # writes from maliciously crafted tensors.
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(filepath, weights_only=True)
        return tensor.to_dense()

    def encode_base64(self, media: torch.Tensor) -> str:
        """Serialize a tensor to a base64 string (delegates to tensor2base64)."""
        return tensor2base64(media)
61
vllm/multimodal/media/base.py
Normal file
61
vllm/multimodal/media/base.py
Normal file
@@ -0,0 +1,61 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Generic, TypeVar
|
||||
|
||||
import numpy as np
|
||||
|
||||
_T = TypeVar("_T")


@dataclass
class MediaWithBytes(Generic[_T]):
    """
    Couples a decoded media object with the encoded bytes it came from.

    Keeping the raw bytes alongside the decoded object guarantees the two
    stay synchronized, preventing cache corruption from in-place
    modifications of the decoded object.

    Attribute access falls through to the wrapped media object, so the
    wrapper behaves transparently like the wrapped type (e.g., PIL.Image).

    NOTE: Currently, this wrapper is used only for the image modality.
    """

    # The decoded media object (e.g. a PIL image).
    media: _T
    # The encoded payload `media` was decoded from; excluded from repr.
    original_bytes: bytes = field(repr=False)

    def __array__(self, *args, **kwargs) -> np.ndarray:
        """Make np.array(wrapper) equivalent to np.array(wrapper.media)."""
        return np.array(self.media, *args, **kwargs)

    def __getattr__(self, name: str):
        """Forward unknown attribute lookups to the wrapped media object."""
        return getattr(self.media, name)

    def __getstate__(self):
        # Pickle only the instance dict; __getattr__ needs no special care
        # because it is never consulted for attributes that actually exist.
        return dict(self.__dict__)

    def __setstate__(self, state: dict[str, Any]):
        self.__dict__.update(state)
class MediaIO(ABC, Generic[_T]):
    """Abstract loader interface for a single media modality.

    Subclasses decode a media object of type ``_T`` from raw bytes, a
    base64 payload, or a file on disk.
    """

    @abstractmethod
    def load_bytes(self, data: bytes) -> _T:
        """Decode a media object from an in-memory payload."""
        raise NotImplementedError

    @abstractmethod
    def load_base64(self, media_type: str, data: str) -> _T:
        """
        Decode a media object from a base64 payload.

        ``media_type`` is an IANA media type:
        https://www.iana.org/assignments/media-types/media-types.xhtml
        """
        raise NotImplementedError

    @abstractmethod
    def load_file(self, filepath: Path) -> _T:
        """Decode a media object from a file on disk."""
        raise NotImplementedError
343
vllm/multimodal/media/connector.py
Normal file
343
vllm/multimodal/media/connector.py
Normal file
@@ -0,0 +1,343 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import atexit
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
from typing import Any, TypeVar
|
||||
from urllib.request import url2pathname
|
||||
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
import torch
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from urllib3.util import Url, parse_url
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.connections import HTTPConnection, global_http_connection
|
||||
from vllm.utils.registry import ExtensionManager
|
||||
|
||||
from .audio import AudioEmbeddingMediaIO, AudioMediaIO
|
||||
from .base import MediaIO
|
||||
from .image import ImageEmbeddingMediaIO, ImageMediaIO
|
||||
from .video import VideoMediaIO
|
||||
|
||||
_M = TypeVar("_M")
|
||||
|
||||
global_thread_pool = ThreadPoolExecutor(
|
||||
max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT
|
||||
)
|
||||
atexit.register(global_thread_pool.shutdown)
|
||||
|
||||
MEDIA_CONNECTOR_REGISTRY = ExtensionManager()
|
||||
|
||||
|
||||
@MEDIA_CONNECTOR_REGISTRY.register("http")
class MediaConnector:
    """Fetches multimodal media (audio/image/video/embeddings) from URLs.

    Supports http(s), ``data:`` and ``file:`` URLs, with optional domain
    and local-path restrictions. Async variants offload CPU-bound decode
    work to a shared thread pool.
    """

    def __init__(
        self,
        media_io_kwargs: dict[str, dict[str, Any]] | None = None,
        connection: HTTPConnection = global_http_connection,
        *,
        allowed_local_media_path: str = "",
        allowed_media_domains: list[str] | None = None,
    ) -> None:
        """
        Args:
            media_io_kwargs: Additional args passed to process media
                inputs, keyed by modalities. For example,
                to set num_frames for video, set
                `--media-io-kwargs '{"video":{"num_frames":40}}'`
            connection: HTTP connection client to download media contents.
            allowed_local_media_path: A local directory to load media files
                from.
            allowed_media_domains: If set, only media URLs that belong to
                this domain can be used for multi-modal inputs.
        """
        super().__init__()

        self.media_io_kwargs: dict[str, dict[str, Any]] = (
            media_io_kwargs if media_io_kwargs else {}
        )
        self.connection = connection

        if allowed_local_media_path:
            media_root = Path(allowed_local_media_path)

            # Fail fast on a bad configuration rather than at request time.
            if not media_root.exists():
                raise ValueError(
                    "Invalid `--allowed-local-media-path`: The path "
                    f"{media_root} does not exist."
                )
            if not media_root.is_dir():
                raise ValueError(
                    "Invalid `--allowed-local-media-path`: The path "
                    f"{media_root} must be a directory."
                )
        else:
            # No path configured -> file: URLs are rejected later.
            media_root = None

        self.allowed_local_media_path = media_root
        self.allowed_media_domains = (
            [] if allowed_media_domains is None else allowed_media_domains
        )

    def _load_data_url(
        self,
        url_spec: Url,
        media_io: MediaIO[_M],
    ) -> _M:  # type: ignore[type-var]
        """Decode a ``data:`` URL (base64 payloads only)."""
        spec_path = url_spec.path or ""
        data_spec, payload = spec_path.split(",", 1)
        media_type, data_type = data_spec.split(";", 1)
        # media_type carries a leading "/" (e.g., "/video/jpeg"); strip it.
        media_type = media_type.lstrip("/")

        if data_type != "base64":
            msg = "Only base64 data URLs are supported for now."
            raise NotImplementedError(msg)

        return media_io.load_base64(media_type, payload)

    def _load_file_url(
        self,
        url_spec: Url,
        media_io: MediaIO[_M],
    ) -> _M:  # type: ignore[type-var]
        """Load a ``file:`` URL, restricted to the allowed media directory."""
        media_root = self.allowed_local_media_path
        if media_root is None:
            raise RuntimeError(
                "Cannot load local files without `--allowed-local-media-path`."
            )

        spec_path = url_spec.path or ""
        spec_netloc = url_spec.netloc or ""
        filepath = Path(url2pathname(spec_netloc + spec_path))
        # The resolved target must live strictly below the allowed directory
        # (blocks path traversal via "..", symlinks, etc.).
        if media_root not in filepath.resolve().parents:
            raise ValueError(
                f"The file path {filepath} must be a subpath "
                f"of `--allowed-local-media-path {media_root}`."
            )

        return media_io.load_file(filepath)

    def _assert_url_in_allowed_media_domains(self, url_spec: Url) -> None:
        """Raise if domain restrictions are active and the host is not listed."""
        domains = self.allowed_media_domains
        if domains and url_spec.hostname not in domains:
            raise ValueError(
                f"The URL must be from one of the allowed domains: "
                f"{domains}. Input URL domain: "
                f"{url_spec.hostname}"
            )

    def load_from_url(
        self,
        url: str,
        media_io: MediaIO[_M],
        *,
        fetch_timeout: int | None = None,
    ) -> _M:  # type: ignore[type-var]
        """Synchronously load media from an http(s), data or file URL."""
        url_spec = parse_url(url)

        if url_spec.scheme and url_spec.scheme.startswith("http"):
            self._assert_url_in_allowed_media_domains(url_spec)

            data = self.connection.get_bytes(
                url_spec.url,
                timeout=fetch_timeout,
                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
            )
            return media_io.load_bytes(data)

        if url_spec.scheme == "data":
            return self._load_data_url(url_spec, media_io)

        if url_spec.scheme == "file":
            return self._load_file_url(url_spec, media_io)

        raise ValueError("The URL must be either a HTTP, data or file URL.")

    async def load_from_url_async(
        self,
        url: str,
        media_io: MediaIO[_M],
        *,
        fetch_timeout: int | None = None,
    ) -> _M:
        """Asynchronously load media; decoding runs on the shared thread pool."""
        url_spec = parse_url(url)
        loop = asyncio.get_running_loop()

        if url_spec.scheme and url_spec.scheme.startswith("http"):
            self._assert_url_in_allowed_media_domains(url_spec)

            data = await self.connection.async_get_bytes(
                url_spec.url,
                timeout=fetch_timeout,
                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
            )
            # Decode off the event loop; decoding can be CPU-heavy.
            return await loop.run_in_executor(
                global_thread_pool, media_io.load_bytes, data
            )

        if url_spec.scheme == "data":
            return await loop.run_in_executor(
                global_thread_pool, self._load_data_url, url_spec, media_io
            )

        if url_spec.scheme == "file":
            return await loop.run_in_executor(
                global_thread_pool, self._load_file_url, url_spec, media_io
            )

        raise ValueError("The URL must be either a HTTP, data or file URL.")

    def fetch_audio(
        self,
        audio_url: str,
    ) -> tuple[np.ndarray, int | float]:
        """Load audio from a URL."""
        audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))

        return self.load_from_url(
            audio_url,
            audio_io,
            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
        )

    async def fetch_audio_async(
        self,
        audio_url: str,
    ) -> tuple[np.ndarray, int | float]:
        """Asynchronously fetch audio from a URL."""
        audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))

        return await self.load_from_url_async(
            audio_url,
            audio_io,
            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
        )

    def fetch_image(
        self,
        image_url: str,
        *,
        image_mode: str = "RGB",
    ) -> Image.Image:
        """Load a PIL image from an HTTP or base64 data URL.

        By default, the image is converted into RGB format.
        """
        image_io = ImageMediaIO(
            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
        )

        try:
            return self.load_from_url(
                image_url,
                image_io,
                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
            )
        except UnidentifiedImageError as e:
            # Re-raise as ValueError so it is properly caught upstream.
            raise ValueError(str(e)) from e

    async def fetch_image_async(
        self,
        image_url: str,
        *,
        image_mode: str = "RGB",
    ) -> Image.Image:
        """Asynchronously load a PIL image from an HTTP or base64 data URL.

        By default, the image is converted into RGB format.
        """
        image_io = ImageMediaIO(
            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
        )

        try:
            return await self.load_from_url_async(
                image_url,
                image_io,
                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
            )
        except UnidentifiedImageError as e:
            # Re-raise as ValueError so it is properly caught upstream.
            raise ValueError(str(e)) from e

    def fetch_video(
        self,
        video_url: str,
        *,
        image_mode: str = "RGB",
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Load video from an HTTP or base64 data URL."""
        image_io = ImageMediaIO(
            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
        )
        video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))

        return self.load_from_url(
            video_url,
            video_io,
            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
        )

    async def fetch_video_async(
        self,
        video_url: str,
        *,
        image_mode: str = "RGB",
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Asynchronously load video from an HTTP or base64 data URL.

        By default, each frame is converted into RGB format.
        """
        image_io = ImageMediaIO(
            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
        )
        video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))

        return await self.load_from_url_async(
            video_url,
            video_io,
            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
        )

    def fetch_image_embedding(
        self,
        data: str,
    ) -> torch.Tensor:
        """Load an image embedding from a base64 payload."""
        image_embedding_io = ImageEmbeddingMediaIO()

        return image_embedding_io.load_base64("", data)

    def fetch_audio_embedding(
        self,
        data: str,
    ) -> torch.Tensor:
        """Load an audio embedding from a base64 payload."""
        audio_embedding_io = AudioEmbeddingMediaIO()

        return audio_embedding_io.load_base64("", data)
113
vllm/multimodal/media/image.py
Normal file
113
vllm/multimodal/media/image.py
Normal file
@@ -0,0 +1,113 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pybase64
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
from ..image import convert_image_mode, rgba_to_rgb
|
||||
from .base import MediaIO, MediaWithBytes
|
||||
|
||||
|
||||
class ImageMediaIO(MediaIO[Image.Image]):
    """PIL image loader/encoder.

    Loaded images are returned as ``MediaWithBytes`` so the original
    encoded payload stays attached to the decoded image.
    """

    def __init__(self, image_mode: str = "RGB", **kwargs) -> None:
        super().__init__()

        self.image_mode = image_mode
        # `kwargs` carries custom arguments from --media-io-kwargs for the
        # image modality. They can be forwarded to the underlying media
        # loaders (e.g. custom implementations) for flexible control.
        self.kwargs = kwargs

        # Background color used when flattening RGBA onto RGB. Defaults to
        # white for backward compatibility; lists (e.g. from JSON) are
        # normalized to tuples.
        rgba_bg = kwargs.get("rgba_background_color", (255, 255, 255))
        if isinstance(rgba_bg, list):
            rgba_bg = tuple(rgba_bg)

        is_valid = (
            isinstance(rgba_bg, tuple)
            and len(rgba_bg) == 3
            and all(isinstance(c, int) and 0 <= c <= 255 for c in rgba_bg)
        )
        if not is_valid:
            raise ValueError(
                "rgba_background_color must be a list or tuple of 3 integers "
                "in the range [0, 255]."
            )
        self.rgba_background_color = rgba_bg

    def _convert_image_mode(
        self, image: Image.Image | MediaWithBytes[Image.Image]
    ) -> Image.Image:
        """Convert an image to ``self.image_mode``, honoring the RGBA background."""
        if isinstance(image, MediaWithBytes):
            image = image.media

        if image.mode == self.image_mode:
            return image
        if image.mode == "RGBA" and self.image_mode == "RGB":
            # Flatten alpha onto the configured background color.
            return rgba_to_rgb(image, self.rgba_background_color)
        return convert_image_mode(image, self.image_mode)

    def load_bytes(self, data: bytes) -> MediaWithBytes[Image.Image]:
        """Decode an in-memory image, keeping the original bytes attached."""
        decoded = Image.open(BytesIO(data))
        return MediaWithBytes(self._convert_image_mode(decoded), data)

    def load_base64(self, media_type: str, data: str) -> MediaWithBytes[Image.Image]:
        """Decode a strict-base64 image payload; `media_type` is unused."""
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> MediaWithBytes[Image.Image]:
        """Decode an image file, keeping the original bytes attached."""
        with open(filepath, "rb") as f:
            data = f.read()
        decoded = Image.open(BytesIO(data))
        return MediaWithBytes(self._convert_image_mode(decoded), data)

    def encode_base64(
        self,
        media: Image.Image,
        *,
        image_format: str = "PNG",
    ) -> str:
        """Encode an image to a base64 string in the given format."""
        with BytesIO() as buffer:
            self._convert_image_mode(media).save(buffer, image_format)
            payload = buffer.getvalue()

        return pybase64.b64encode(payload).decode("utf-8")
class ImageEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Loader for precomputed image embeddings serialized with ``torch.save``."""

    def __init__(self) -> None:
        super().__init__()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        """Deserialize a tensor from raw bytes (safe, weights-only load)."""
        # Sparse tensor invariant checking guards against out-of-bounds
        # writes from maliciously crafted tensors.
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(BytesIO(data), weights_only=True)
        return tensor.to_dense()

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        """Deserialize a tensor from strict base64; `media_type` is unused."""
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> torch.Tensor:
        """Deserialize a tensor from a file on disk (safe, weights-only load)."""
        # Sparse tensor invariant checking guards against out-of-bounds
        # writes from maliciously crafted tensors.
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(filepath, weights_only=True)
        return tensor.to_dense()

    def encode_base64(self, media: torch.Tensor) -> str:
        """Serialize a tensor to a base64 string (delegates to tensor2base64)."""
        return tensor2base64(media)
89
vllm/multimodal/media/video.py
Normal file
89
vllm/multimodal/media/video.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import base64
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
from PIL import Image
|
||||
|
||||
from vllm import envs
|
||||
|
||||
from ..video import VIDEO_LOADER_REGISTRY
|
||||
from .base import MediaIO
|
||||
from .image import ImageMediaIO
|
||||
|
||||
|
||||
class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
    """Video loader/encoder returning ``(frames, metadata)`` tuples.

    Decoding is delegated to a pluggable backend from
    ``VIDEO_LOADER_REGISTRY``; encoding emits comma-separated base64 JPEG
    frames via the wrapped ``ImageMediaIO``.
    """

    def __init__(
        self,
        image_io: ImageMediaIO,
        num_frames: int = 32,
        **kwargs,
    ) -> None:
        super().__init__()

        self.image_io = image_io
        self.num_frames = num_frames
        # `kwargs` carries custom arguments from --media-io-kwargs for the
        # video modality. They can be forwarded to the underlying media
        # loaders (e.g. custom implementations) for flexible control.
        #
        # "video_backend" may be set per request to override the global
        # VLLM_VIDEO_LOADER_BACKEND env var, e.g.:
        # --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}'
        backend = kwargs.pop("video_backend", None) or envs.VLLM_VIDEO_LOADER_BACKEND
        self.kwargs = kwargs
        self.video_loader = VIDEO_LOADER_REGISTRY.load(backend)

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode an in-memory video via the configured backend."""
        return self.video_loader.load_bytes(
            data, num_frames=self.num_frames, **self.kwargs
        )

    def load_base64(
        self, media_type: str, data: str
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode base64 video.

        "video/jpeg" payloads are comma-separated base64 JPEG frames; any
        other media type is treated as an encoded video container.
        """
        if media_type.lower() == "video/jpeg":
            decode_frame = partial(self.image_io.load_base64, "image/jpeg")
            frames = [
                np.asarray(decode_frame(chunk)) for chunk in data.split(",")
            ]
            # Per-frame decoding yields no container-level metadata.
            return np.stack(frames), {}

        return self.load_bytes(base64.b64decode(data))

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode a video file from disk."""
        return self.load_bytes(filepath.read_bytes())

    def encode_base64(
        self,
        media: npt.NDArray,
        *,
        video_format: str = "JPEG",
    ) -> str:
        """Encode frames as comma-separated base64 JPEGs (JPEG only)."""
        if video_format == "JPEG":
            encode_frame = partial(
                self.image_io.encode_base64,
                image_format=video_format,
            )
            return ",".join(
                encode_frame(Image.fromarray(frame)) for frame in media
            )

        msg = "Only JPEG format is supported for now."
        raise NotImplementedError(msg)
Reference in New Issue
Block a user