Reduce the image processing latency in VLM (#11541)
Co-authored-by: qiuxuan.lzw <qiuxuan.lzw@alibaba-inc.com>
This commit is contained in:
@@ -221,6 +221,10 @@ class Envs:
|
||||
SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
|
||||
SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
|
||||
|
||||
# VLM
|
||||
SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
|
||||
SGLANG_RESIZE_RESAMPLE = EnvStr("")
|
||||
|
||||
# fmt: on
|
||||
|
||||
|
||||
|
||||
@@ -313,7 +313,9 @@ class BaseMultimodalProcessor(ABC):
|
||||
try:
|
||||
if modality == Modality.IMAGE:
|
||||
img, _ = load_image(data)
|
||||
return img.convert("RGB") if discard_alpha_channel else img
|
||||
if discard_alpha_channel and img.mode != "RGB":
|
||||
img = img.convert("RGB")
|
||||
return img
|
||||
elif modality == Modality.VIDEO:
|
||||
return load_video(data, frame_count_limit)
|
||||
elif modality == Modality.AUDIO:
|
||||
|
||||
@@ -9,6 +9,7 @@ import torchvision
|
||||
from PIL import Image
|
||||
from torchvision.transforms import InterpolationMode
|
||||
|
||||
from sglang.srt.environ import envs
|
||||
from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
|
||||
from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
|
||||
from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
|
||||
@@ -23,8 +24,14 @@ from sglang.utils import logger
|
||||
|
||||
IMAGE_FACTOR = 28
|
||||
MIN_PIXELS = 4 * 28 * 28
|
||||
MAX_PIXELS = 16384 * 28 * 28
|
||||
MAX_PIXELS = envs.SGLANG_IMAGE_MAX_PIXELS.get()
|
||||
MAX_RATIO = 200
|
||||
# Resampling filter handed to PIL's Image.resize(), chosen by name through the
# SGLANG_RESIZE_RESAMPLE environment variable (for example "BILINEAR").
# None means no explicit filter is requested and Pillow picks its own default.
_resample_name = (envs.SGLANG_RESIZE_RESAMPLE.get() or "").strip().upper()
# Resolve against Image.Resampling when this Pillow version provides it, so
# unrelated attributes of the Image module (functions, classes, other
# constants) cannot be selected by accident. Older Pillow versions only expose
# the filters as module-level integers, hence the fallback to the module.
_resample_namespace = getattr(Image, "Resampling", Image)
RESIZE_RESAMPLE = (
    getattr(_resample_namespace, _resample_name, None) if _resample_name else None
)
if not isinstance(RESIZE_RESAMPLE, int):
    # A non-integer value (e.g. a function such as Image.open) is not a filter
    # and would make every later Image.resize() call fail, so reject it here.
    RESIZE_RESAMPLE = None
if envs.SGLANG_RESIZE_RESAMPLE.is_set() and RESIZE_RESAMPLE is None:
    logger.warning(
        f"Invalid RESIZE_RESAMPLE value: '{envs.SGLANG_RESIZE_RESAMPLE.get()}'. "
        "Ignoring and using default."
    )
|
||||
VIDEO_TOTAL_PIXELS = int(
|
||||
float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
|
||||
)
|
||||
@@ -86,7 +93,7 @@ def resize_image(
|
||||
min_pixels=min_pixels,
|
||||
max_pixels=max_pixels,
|
||||
)
|
||||
image = image.resize((resized_width, resized_height))
|
||||
image = image.resize((resized_width, resized_height), resample=RESIZE_RESAMPLE)
|
||||
return image
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user