diff --git a/python/pyproject.toml b/python/pyproject.toml
index 86467457a..3d72566f7 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -21,6 +21,7 @@ runtime_common = [
     "build",
     "compressed-tensors",
     "datasets",
+    "video-reader-rs",
     "fastapi",
     "hf_transfer",
     "huggingface_hub",
diff --git a/python/sglang/check_env.py b/python/sglang/check_env.py
index 1870e3207..ba42c17be 100644
--- a/python/sglang/check_env.py
+++ b/python/sglang/check_env.py
@@ -47,7 +47,7 @@ PACKAGE_LIST = [
     "tiktoken",
     "anthropic",
     "litellm",
-    "decord",
+    "video-reader-rs",
 ]
 
 
diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py
index 91aaa1909..7d7784c18 100644
--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -206,7 +206,7 @@ class BaseMultimodalProcessor(ABC):
         estimate the total frame count from all visual input
         """
         # Lazy import because decord is not available on some arm platforms.
-        from decord import VideoReader, cpu
+        from video_reader import PyVideoReader
 
         # Before processing inputs
         if not image_data or len(image_data) == 0:
@@ -216,7 +216,7 @@ class BaseMultimodalProcessor(ABC):
             if isinstance(image, str) and image.startswith("video:"):
                 path = image[len("video:") :]
                 # Estimate frames for the video
-                vr = VideoReader(path, ctx=cpu(0))
+                vr = PyVideoReader(path, threads=0)
                 num_frames = len(vr)
             else:
                 # For images, each contributes one frame
diff --git a/python/sglang/srt/multimodal/processors/internvl.py b/python/sglang/srt/multimodal/processors/internvl.py
index df9b67aad..4b27a91a3 100644
--- a/python/sglang/srt/multimodal/processors/internvl.py
+++ b/python/sglang/srt/multimodal/processors/internvl.py
@@ -150,7 +150,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
     def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
         vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
         max_frame = len(vr) - 1
-        fps = float(vr.get_avg_fps())
+        fps = float(vr.get_fps())
 
         pixel_values_list, num_patches_list = [], []
         transform = InternVLImageProcessor.build_transform(input_size=input_size)
@@ -158,7 +158,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             bound, fps, max_frame, first_idx=0, num_segments=num_segments
         )
         for frame_index in frame_indices:
-            img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
+            img = Image.fromarray(vr[frame_index]).convert("RGB")
             img = InternVLImageProcessor.dynamic_preprocess(
                 img, image_size=input_size, use_thumbnail=True, max_num=max_num
             )
diff --git a/python/sglang/srt/multimodal/processors/qwen_vl.py b/python/sglang/srt/multimodal/processors/qwen_vl.py
index 1ecb4e119..68381dbec 100644
--- a/python/sglang/srt/multimodal/processors/qwen_vl.py
+++ b/python/sglang/srt/multimodal/processors/qwen_vl.py
@@ -156,10 +156,10 @@ async def preprocess_video(
     # vr: VideoReader,
     image_factor: int = IMAGE_FACTOR
 ) -> torch.Tensor:
     ele = {}
-    total_frames, video_fps = len(vr), vr.get_avg_fps()
+    total_frames, video_fps = len(vr), vr.get_fps()
     nframes = smart_nframes({}, total_frames=total_frames, video_fps=video_fps)
     idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
-    video = vr.get_batch(idx).asnumpy()
+    video = vr.get_batch(idx)
     video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
     nframes, _, height, width = video.shape
     min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index ce159a4da..377fa90c8 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -84,6 +84,7 @@ from torch.library import Library
 from torch.profiler import ProfilerActivity, profile, record_function
 from torch.utils._contextlib import _DecoratorContextManager
 from triton.runtime.cache import FileCacheManager
+from video_reader import PyVideoReader
 
 logger = logging.getLogger(__name__)
 
@@ -757,16 +758,9 @@ def load_image(
 
 
 def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
     # We import decord here to avoid a strange Segmentation fault (core dumped) issue.
-    from decord import VideoReader, cpu, gpu
-
-    try:
-        from decord.bridge import decord_bridge
-
-        ctx = gpu(0)
-        _ = decord_bridge.get_ctx_device(ctx)
-    except Exception:
-        ctx = cpu(0)
+    from video_reader import PyVideoReader
+    device = "cuda" if use_gpu and torch.cuda.is_available() else None
     tmp_file = None
     vr = None
     try:
@@ -774,7 +768,7 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
             tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
             tmp_file.write(video_file)
             tmp_file.close()
-            vr = VideoReader(tmp_file.name, ctx=ctx)
+            vr = PyVideoReader(tmp_file.name, device=device, threads=0)
         elif isinstance(video_file, str):
             if video_file.startswith(("http://", "https://")):
                 timeout = int(os.getenv("REQUEST_TIMEOUT", "10"))
@@ -784,22 +778,22 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
                 for chunk in response.iter_content(chunk_size=8192):
                     tmp_file.write(chunk)
                 tmp_file.close()
-                vr = VideoReader(tmp_file.name, ctx=ctx)
+                vr = PyVideoReader(tmp_file.name, device=device, threads=0)
             elif video_file.startswith("data:"):
                 _, encoded = video_file.split(",", 1)
                 video_bytes = base64.b64decode(encoded)
                 tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
                 tmp_file.write(video_bytes)
                 tmp_file.close()
-                vr = VideoReader(tmp_file.name, ctx=ctx)
+                vr = PyVideoReader(tmp_file.name, device=device, threads=0)
             elif os.path.isfile(video_file):
-                vr = VideoReader(video_file, ctx=ctx)
+                vr = PyVideoReader(video_file, device=device, threads=0)
             else:
                 video_bytes = base64.b64decode(video_file)
                 tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
                 tmp_file.write(video_bytes)
                 tmp_file.close()
-                vr = PyVideoReader(tmp_file.name, device=device, threads=0)
         else:
             raise ValueError(f"Unsupported video input type: {type(video_file)}")
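
For reference, a minimal sketch of the video-reader-rs calls this patch switches to, alongside their decord equivalents. It assumes only the API surface already exercised in the hunks above (PyVideoReader, get_fps, get_batch, len); the file name and the frame count of 8 are placeholders, not part of the patch.

```python
# Minimal sketch (not part of the patch) of the replacement API used above.
import torch
from video_reader import PyVideoReader

vr = PyVideoReader("sample.mp4", threads=0)        # decord: VideoReader(path, ctx=cpu(0))
fps = float(vr.get_fps())                          # decord: vr.get_avg_fps()
total_frames = len(vr)                             # frame count, same as with decord

idx = torch.linspace(0, total_frames - 1, 8).round().long().tolist()
frames = vr.get_batch(idx)                         # returns frames directly; no .asnumpy()
video = torch.tensor(frames).permute(0, 3, 1, 2)   # THWC -> TCHW, as in qwen_vl.py above
```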