diff --git a/python/pyproject.toml b/python/pyproject.toml index c538c4bcb..7afb3581a 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -21,7 +21,6 @@ runtime_common = [ "build", "compressed-tensors", "datasets", - "video-reader-rs", "fastapi", "hf_transfer", "huggingface_hub", diff --git a/python/sglang/check_env.py b/python/sglang/check_env.py index ba42c17be..1870e3207 100644 --- a/python/sglang/check_env.py +++ b/python/sglang/check_env.py @@ -47,7 +47,7 @@ PACKAGE_LIST = [ "tiktoken", "anthropic", "litellm", - "video-reader-rs", + "decord", ] diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py index 7d7784c18..91aaa1909 100644 --- a/python/sglang/srt/multimodal/processors/base_processor.py +++ b/python/sglang/srt/multimodal/processors/base_processor.py @@ -206,7 +206,7 @@ class BaseMultimodalProcessor(ABC): estimate the total frame count from all visual input """ # Lazy import because decord is not available on some arm platforms. - from video_reader import PyVideoReader, cpu + from decord import VideoReader, cpu # Before processing inputs if not image_data or len(image_data) == 0: @@ -216,7 +216,7 @@ class BaseMultimodalProcessor(ABC): if isinstance(image, str) and image.startswith("video:"): path = image[len("video:") :] # Estimate frames for the video - vr = PyVideoReader(path, threads=0) + vr = VideoReader(path, ctx=cpu(0)) num_frames = len(vr) else: # For images, each contributes one frame diff --git a/python/sglang/srt/multimodal/processors/internvl.py b/python/sglang/srt/multimodal/processors/internvl.py index 4b27a91a3..df9b67aad 100644 --- a/python/sglang/srt/multimodal/processors/internvl.py +++ b/python/sglang/srt/multimodal/processors/internvl.py @@ -150,7 +150,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor): def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32): vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) max_frame = len(vr) - 1 - fps = float(vr.get_fps()) + fps = float(vr.get_avg_fps()) pixel_values_list, num_patches_list = [], [] transform = InternVLImageProcessor.build_transform(input_size=input_size) @@ -158,7 +158,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor): bound, fps, max_frame, first_idx=0, num_segments=num_segments ) for frame_index in frame_indices: - img = Image.fromarray(vr[frame_index]).convert("RGB") + img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB") img = InternVLImageProcessor.dynamic_preprocess( img, image_size=input_size, use_thumbnail=True, max_num=max_num ) diff --git a/python/sglang/srt/multimodal/processors/qwen_vl.py b/python/sglang/srt/multimodal/processors/qwen_vl.py index 68381dbec..1ecb4e119 100644 --- a/python/sglang/srt/multimodal/processors/qwen_vl.py +++ b/python/sglang/srt/multimodal/processors/qwen_vl.py @@ -156,10 +156,10 @@ async def preprocess_video( # vr: VideoReader, image_factor: int = IMAGE_FACTOR ) -> torch.Tensor: ele = {} - total_frames, video_fps = len(vr), vr.get_fps() + total_frames, video_fps = len(vr), vr.get_avg_fps() nframes = smart_nframes({}, total_frames=total_frames, video_fps=video_fps) idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist() - video = vr.get_batch(idx) + video = vr.get_batch(idx).asnumpy() video = torch.tensor(video).permute(0, 3, 1, 2) # Convert to TCHW format nframes, _, height, width = video.shape min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS) diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index d055aab5b..37e06b8dc 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -84,7 +84,6 @@ from torch.library import Library from torch.profiler import ProfilerActivity, profile, record_function from torch.utils._contextlib import _DecoratorContextManager from triton.runtime.cache import FileCacheManager -from video_reader import PyVideoReader logger = logging.getLogger(__name__) @@ -758,9 +757,16 @@ def load_image( def load_video(video_file: Union[str, bytes], use_gpu: bool = True): # We import decord here to avoid a strange Segmentation fault (core dumped) issue. - from video_reader import PyVideoReader + from decord import VideoReader, cpu, gpu + + try: + from decord.bridge import decord_bridge + + ctx = gpu(0) + _ = decord_bridge.get_ctx_device(ctx) + except Exception: + ctx = cpu(0) - device = "cuda" if use_gpu and torch.cuda.is_available() else None tmp_file = None vr = None try: @@ -768,7 +774,7 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True): tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") tmp_file.write(video_file) tmp_file.close() - vr = PyVideoReader(tmp_file.name, device=device, threads=0) + vr = VideoReader(tmp_file.name, ctx=ctx) elif isinstance(video_file, str): if video_file.startswith(("http://", "https://")): timeout = int(os.getenv("REQUEST_TIMEOUT", "10")) @@ -778,22 +784,22 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True): for chunk in response.iter_content(chunk_size=8192): tmp_file.write(chunk) tmp_file.close() - vr = PyVideoReader(tmp_file.name, device=device, threads=0) + vr = VideoReader(tmp_file.name, ctx=ctx) elif video_file.startswith("data:"): _, encoded = video_file.split(",", 1) video_bytes = base64.b64decode(encoded) tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") tmp_file.write(video_bytes) tmp_file.close() - vr = PyVideoReader(tmp_file.name, device=device, threads=0) + vr = VideoReader(tmp_file.name, ctx=ctx) elif os.path.isfile(video_file): - vr = PyVideoReader(video_file, device=device, threads=0) + vr = VideoReader(video_file, ctx=ctx) else: video_bytes = base64.b64decode(video_file) tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") tmp_file.write(video_bytes) tmp_file.close() - vr = PyVideoReader(tmp_file.name, device=device, threads=0) + vr = VideoReader(tmp_file.name, ctx=ctx) else: raise ValueError(f"Unsupported video input type: {type(video_file)}")