diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py
index 7a31365ff..399125805 100644
--- a/python/sglang/srt/environ.py
+++ b/python/sglang/srt/environ.py
@@ -221,6 +221,10 @@ class Envs:
     SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
     SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
 
+    # VLM
+    SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
+    SGLANG_RESIZE_RESAMPLE = EnvStr("")
+
     # fmt: on
 
 
diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py
index 91b8ada74..95ee3a486 100644
--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -313,7 +313,9 @@ class BaseMultimodalProcessor(ABC):
         try:
             if modality == Modality.IMAGE:
                 img, _ = load_image(data)
-                return img.convert("RGB") if discard_alpha_channel else img
+                if discard_alpha_channel and img.mode != "RGB":
+                    img = img.convert("RGB")
+                return img
             elif modality == Modality.VIDEO:
                 return load_video(data, frame_count_limit)
             elif modality == Modality.AUDIO:
diff --git a/python/sglang/srt/multimodal/processors/qwen_vl.py b/python/sglang/srt/multimodal/processors/qwen_vl.py
index b6b899ebd..21787b392 100644
--- a/python/sglang/srt/multimodal/processors/qwen_vl.py
+++ b/python/sglang/srt/multimodal/processors/qwen_vl.py
@@ -9,6 +9,7 @@ import torchvision
 from PIL import Image
 from torchvision.transforms import InterpolationMode
 
+from sglang.srt.environ import envs
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
@@ -23,8 +24,26 @@ from sglang.utils import logger
 
 IMAGE_FACTOR = 28
 MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
+MAX_PIXELS = envs.SGLANG_IMAGE_MAX_PIXELS.get()
 MAX_RATIO = 200
+# Optional Pillow resampling filter for resize_image, chosen by name via the
+# SGLANG_RESIZE_RESAMPLE env var (case-insensitive); None keeps Pillow's default.
+_RESAMPLE_FILTER_NAMES = ("NEAREST", "BOX", "BILINEAR", "HAMMING", "BICUBIC", "LANCZOS")
+_resample_raw = envs.SGLANG_RESIZE_RESAMPLE.get() or ""
+_resample_name = _resample_raw.strip().upper()
+# Whitelist the name: a bare getattr(Image, ...) would also resolve unrelated
+# attributes such as "open", which would only fail later inside Image.resize().
+RESIZE_RESAMPLE = (
+    getattr(Image, _resample_name, None)
+    if _resample_name in _RESAMPLE_FILTER_NAMES
+    else None
+)
+if _resample_name and RESIZE_RESAMPLE is None:
+    logger.warning(
+        f"Invalid SGLANG_RESIZE_RESAMPLE value: '{_resample_raw}'. "
+        f"Expected one of: {', '.join(_RESAMPLE_FILTER_NAMES)}. "
+        "Ignoring and using default."
+    )
 VIDEO_TOTAL_PIXELS = int(
     float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
 )
@@ -86,7 +105,12 @@ def resize_image(
         min_pixels=min_pixels,
         max_pixels=max_pixels,
     )
-    image = image.resize((resized_width, resized_height))
+    if RESIZE_RESAMPLE is None:
+        # Omit the argument so Pillow applies its own default filter; some older
+        # Pillow releases reject an explicit resample=None.
+        image = image.resize((resized_width, resized_height))
+    else:
+        image = image.resize((resized_width, resized_height), resample=RESIZE_RESAMPLE)
     return image
 
 