diff --git a/python/sglang/srt/multimodal/processors/internvl.py b/python/sglang/srt/multimodal/processors/internvl.py
index 9c20664d6..c9a2d97ef 100644
--- a/python/sglang/srt/multimodal/processors/internvl.py
+++ b/python/sglang/srt/multimodal/processors/internvl.py
@@ -1,5 +1,7 @@
 # Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
 
+from functools import lru_cache
+
 import numpy as np
 import torch
 import torchvision.transforms as T
@@ -19,6 +21,20 @@ from sglang.srt.multimodal.processors.base_processor import (
 class InternVLImageProcessor(BaseMultimodalProcessor):
     models = [InternVLChatModel, InternS1ForConditionalGeneration]
 
+    IMAGENET_MEAN = [0.485, 0.456, 0.406]
+    IMAGENET_STD = [0.229, 0.224, 0.225]
+
+    @staticmethod
+    @lru_cache(maxsize=1)
+    def _get_normalize_tensors(device="cuda", dtype=torch.float32):
+        mean = torch.tensor(
+            InternVLImageProcessor.IMAGENET_MEAN, device=device, dtype=dtype
+        ).view(-1, 1, 1)
+        std = torch.tensor(
+            InternVLImageProcessor.IMAGENET_STD, device=device, dtype=dtype
+        ).view(-1, 1, 1)
+        return mean, std
+
     def __init__(self, hf_config, server_args, _image_processor, *args, **kwargs):
         super().__init__(hf_config, server_args, _image_processor, *args, **kwargs)
         image_size = (
@@ -88,6 +104,8 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             bound, fps, max_frame, first_idx=0, num_segments=num_segments
         )
 
+        mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
+
         for frame_index in frame_indices:
             # Load frame
             frame = vr[frame_index]
@@ -97,10 +115,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
                 img_np = frame.asnumpy()
                 img = torch.from_numpy(img_np).permute(2, 0, 1).cuda().float() / 255.0
 
-            # Using the mean and variance of the ImageNet dataset for all input images can lead to accuracy issues, while using the mean and variance of each input image is a more accurate choice.
-            mean = img.mean(dim=[1, 2], keepdim=True)
-            # Prevent division by zero; clamp to minimum value of 1e-6
-            std = img.std(dim=[1, 2], keepdim=True).clamp(min=1e-6)
             img = (img - mean) / std
 
             tiles = InternVLImageProcessor.dynamic_preprocess(
@@ -188,6 +202,8 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         num_patches_list = []
         pixel_values = []
 
+        mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
+
         # Process each input with allocated frames
         for image_index, image in enumerate(base_output.images):
             try:
@@ -201,10 +217,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
                 else:
                     tensor = image.cuda()  # assume already tensor
 
-                # Using the mean and variance of the ImageNet dataset for all input images can lead to accuracy issues, while using the mean and variance of each input image is a more accurate choice.
-                mean = tensor.mean(dim=[1, 2], keepdim=True)
-                # Prevent division by zero; clamp to minimum value of 1e-6
-                std = tensor.std(dim=[1, 2], keepdim=True).clamp(min=1e-6)
                 tensor = (tensor - mean) / std
                 tiles = self.dynamic_preprocess(
                     tensor, image_size=448, max_num=12, use_thumbnail=True