Reduce image-processing latency for VLMs (#11541)

Co-authored-by: qiuxuan.lzw <qiuxuan.lzw@alibaba-inc.com>
2025-10-17 06:00:03 +08:00
parent b0d1d717e1
commit fd389df96e
3 changed files with 16 additions and 3 deletions
--- a/python/sglang/srt/environ.py
+++ b/python/sglang/srt/environ.py
@@ -221,6 +221,10 @@ class Envs:
    SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
    SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)

+    # VLM
+    SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
+    SGLANG_RESIZE_RESAMPLE = EnvStr("")
+
    # fmt: on


--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -313,7 +313,9 @@ class BaseMultimodalProcessor(ABC):
        try:
            if modality == Modality.IMAGE:
                img, _ = load_image(data)
-                return img.convert("RGB") if discard_alpha_channel else img
+                if discard_alpha_channel and img.mode != "RGB":
+                    img = img.convert("RGB")
+                return img
            elif modality == Modality.VIDEO:
                return load_video(data, frame_count_limit)
            elif modality == Modality.AUDIO:
--- a/python/sglang/srt/multimodal/processors/qwen_vl.py
+++ b/python/sglang/srt/multimodal/processors/qwen_vl.py
@@ -9,6 +9,7 @@ import torchvision
 from PIL import Image
 from torchvision.transforms import InterpolationMode

+from sglang.srt.environ import envs
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
@@ -23,8 +24,14 @@ from sglang.utils import logger

 IMAGE_FACTOR = 28
 MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
+MAX_PIXELS = envs.SGLANG_IMAGE_MAX_PIXELS.get()
 MAX_RATIO = 200
+RESIZE_RESAMPLE = getattr(Image, envs.SGLANG_RESIZE_RESAMPLE.get(), None)
+if envs.SGLANG_RESIZE_RESAMPLE.is_set() and RESIZE_RESAMPLE is None:
+    logger.warning(
+        f"Invalid RESIZE_RESAMPLE value: '{envs.SGLANG_RESIZE_RESAMPLE.get()}'. "
+        f"Ignoring and using default."
+    )
 VIDEO_TOTAL_PIXELS = int(
    float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
 )
@@ -86,7 +93,7 @@ def resize_image(
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )
-    image = image.resize((resized_width, resized_height))
+    image = image.resize((resized_width, resized_height), resample=RESIZE_RESAMPLE)
    return image