diff --git a/vllm_kunlun/models/internvl.py b/vllm_kunlun/models/internvl.py
index de5eece..24602da 100644
--- a/vllm_kunlun/models/internvl.py
+++ b/vllm_kunlun/models/internvl.py
@@ -13,6 +13,7 @@ from collections.abc import Iterable, Mapping, Sequence
 from typing import Annotated, Any, Literal, Optional, TypeVar, Union
 
 import numpy.typing as npt
+import numpy as np
 import torch
 import torch.nn as nn
 import torchvision.transforms as T
@@ -297,6 +298,8 @@ def video_to_pixel_values_internvl(
     transform = build_transform(input_size=input_size)
     frames_list = list[Image.Image]()
     for frame in video:
+        if frame.dtype != np.uint8:
+            frame = frame.astype(np.uint8)
         pil_frame = dynamic_preprocess_internvl(
             Image.fromarray(frame, mode="RGB"),
             target_ratios=target_ratios,
@@ -1420,4 +1423,4 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         return MultiModelKeys.from_string_field(
             language_model="language_model",
             connector="mlp1",
-            tower_model="vision_model")
\ No newline at end of file
+            tower_model="vision_model")