fix(internvl): fix accuracy issue of normalization (#10375)
This commit is contained in:
@@ -1,5 +1,7 @@
|
|||||||
# Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
|
# Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
|
||||||
|
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
import torchvision.transforms as T
|
import torchvision.transforms as T
|
||||||
@@ -19,6 +21,20 @@ from sglang.srt.multimodal.processors.base_processor import (
|
|||||||
class InternVLImageProcessor(BaseMultimodalProcessor):
|
class InternVLImageProcessor(BaseMultimodalProcessor):
|
||||||
models = [InternVLChatModel, InternS1ForConditionalGeneration]
|
models = [InternVLChatModel, InternS1ForConditionalGeneration]
|
||||||
|
|
||||||
|
IMAGENET_MEAN = [0.485, 0.456, 0.406]
|
||||||
|
IMAGENET_STD = [0.229, 0.224, 0.225]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
@lru_cache(maxsize=1)
|
||||||
|
def _get_normalize_tensors(device="cuda", dtype=torch.float32):
|
||||||
|
mean = torch.tensor(
|
||||||
|
InternVLImageProcessor.IMAGENET_MEAN, device=device, dtype=dtype
|
||||||
|
).view(-1, 1, 1)
|
||||||
|
std = torch.tensor(
|
||||||
|
InternVLImageProcessor.IMAGENET_STD, device=device, dtype=dtype
|
||||||
|
).view(-1, 1, 1)
|
||||||
|
return mean, std
|
||||||
|
|
||||||
def __init__(self, hf_config, server_args, _image_processor, *args, **kwargs):
|
def __init__(self, hf_config, server_args, _image_processor, *args, **kwargs):
|
||||||
super().__init__(hf_config, server_args, _image_processor, *args, **kwargs)
|
super().__init__(hf_config, server_args, _image_processor, *args, **kwargs)
|
||||||
image_size = (
|
image_size = (
|
||||||
@@ -88,6 +104,8 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
|
|||||||
bound, fps, max_frame, first_idx=0, num_segments=num_segments
|
bound, fps, max_frame, first_idx=0, num_segments=num_segments
|
||||||
)
|
)
|
||||||
|
|
||||||
|
mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
|
||||||
|
|
||||||
for frame_index in frame_indices:
|
for frame_index in frame_indices:
|
||||||
# Load frame
|
# Load frame
|
||||||
frame = vr[frame_index]
|
frame = vr[frame_index]
|
||||||
@@ -97,10 +115,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
|
|||||||
img_np = frame.asnumpy()
|
img_np = frame.asnumpy()
|
||||||
img = torch.from_numpy(img_np).permute(2, 0, 1).cuda().float() / 255.0
|
img = torch.from_numpy(img_np).permute(2, 0, 1).cuda().float() / 255.0
|
||||||
|
|
||||||
# Using the mean and variance of the ImageNet dataset for all input images can lead to accuracy issues, while using the mean and variance of each input image is a more accurate choice.
|
|
||||||
mean = img.mean(dim=[1, 2], keepdim=True)
|
|
||||||
# Prevent division by zero; clamp to minimum value of 1e-6
|
|
||||||
std = img.std(dim=[1, 2], keepdim=True).clamp(min=1e-6)
|
|
||||||
img = (img - mean) / std
|
img = (img - mean) / std
|
||||||
|
|
||||||
tiles = InternVLImageProcessor.dynamic_preprocess(
|
tiles = InternVLImageProcessor.dynamic_preprocess(
|
||||||
@@ -188,6 +202,8 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
|
|||||||
num_patches_list = []
|
num_patches_list = []
|
||||||
pixel_values = []
|
pixel_values = []
|
||||||
|
|
||||||
|
mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
|
||||||
|
|
||||||
# Process each input with allocated frames
|
# Process each input with allocated frames
|
||||||
for image_index, image in enumerate(base_output.images):
|
for image_index, image in enumerate(base_output.images):
|
||||||
try:
|
try:
|
||||||
@@ -201,10 +217,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
|
|||||||
else:
|
else:
|
||||||
tensor = image.cuda() # assume already tensor
|
tensor = image.cuda() # assume already tensor
|
||||||
|
|
||||||
# Using the mean and variance of the ImageNet dataset for all input images can lead to accuracy issues, while using the mean and variance of each input image is a more accurate choice.
|
|
||||||
mean = tensor.mean(dim=[1, 2], keepdim=True)
|
|
||||||
# Prevent division by zero; clamp to minimum value of 1e-6
|
|
||||||
std = tensor.std(dim=[1, 2], keepdim=True).clamp(min=1e-6)
|
|
||||||
tensor = (tensor - mean) / std
|
tensor = (tensor - mean) / std
|
||||||
tiles = self.dynamic_preprocess(
|
tiles = self.dynamic_preprocess(
|
||||||
tensor, image_size=448, max_num=12, use_thumbnail=True
|
tensor, image_size=448, max_num=12, use_thumbnail=True
|
||||||
|
|||||||
Reference in New Issue
Block a user