Reduce the image processing latency in VLM (#11541)
Co-authored-by: qiuxuan.lzw <qiuxuan.lzw@alibaba-inc.com>
This commit is contained in:
@@ -221,6 +221,10 @@ class Envs:
|
|||||||
SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
|
SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
|
||||||
SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
|
SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
|
||||||
|
|
||||||
|
# VLM
|
||||||
|
SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
|
||||||
|
SGLANG_RESIZE_RESAMPLE = EnvStr("")
|
||||||
|
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -313,7 +313,9 @@ class BaseMultimodalProcessor(ABC):
|
|||||||
try:
|
try:
|
||||||
if modality == Modality.IMAGE:
|
if modality == Modality.IMAGE:
|
||||||
img, _ = load_image(data)
|
img, _ = load_image(data)
|
||||||
return img.convert("RGB") if discard_alpha_channel else img
|
if discard_alpha_channel and img.mode != "RGB":
|
||||||
|
img = img.convert("RGB")
|
||||||
|
return img
|
||||||
elif modality == Modality.VIDEO:
|
elif modality == Modality.VIDEO:
|
||||||
return load_video(data, frame_count_limit)
|
return load_video(data, frame_count_limit)
|
||||||
elif modality == Modality.AUDIO:
|
elif modality == Modality.AUDIO:
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import torchvision
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
from torchvision.transforms import InterpolationMode
|
from torchvision.transforms import InterpolationMode
|
||||||
|
|
||||||
|
from sglang.srt.environ import envs
|
||||||
from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
|
from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
|
||||||
from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
|
from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
|
||||||
from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
|
from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
|
||||||
@@ -23,8 +24,14 @@ from sglang.utils import logger
|
|||||||
|
|
||||||
IMAGE_FACTOR = 28
|
IMAGE_FACTOR = 28
|
||||||
MIN_PIXELS = 4 * 28 * 28
|
MIN_PIXELS = 4 * 28 * 28
|
||||||
MAX_PIXELS = 16384 * 28 * 28
|
MAX_PIXELS = envs.SGLANG_IMAGE_MAX_PIXELS.get()
|
||||||
MAX_RATIO = 200
|
MAX_RATIO = 200
|
||||||
|
RESIZE_RESAMPLE = getattr(Image, envs.SGLANG_RESIZE_RESAMPLE.get(), None)
|
||||||
|
if envs.SGLANG_RESIZE_RESAMPLE.is_set() and RESIZE_RESAMPLE is None:
|
||||||
|
logger.warning(
|
||||||
|
f"Invalid RESIZE_RESAMPLE value: '{envs.SGLANG_RESIZE_RESAMPLE.get()}'. "
|
||||||
|
f"Ignoring and using default."
|
||||||
|
)
|
||||||
VIDEO_TOTAL_PIXELS = int(
|
VIDEO_TOTAL_PIXELS = int(
|
||||||
float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
|
float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
|
||||||
)
|
)
|
||||||
@@ -86,7 +93,7 @@ def resize_image(
|
|||||||
min_pixels=min_pixels,
|
min_pixels=min_pixels,
|
||||||
max_pixels=max_pixels,
|
max_pixels=max_pixels,
|
||||||
)
|
)
|
||||||
image = image.resize((resized_width, resized_height))
|
image = image.resize((resized_width, resized_height), resample=RESIZE_RESAMPLE)
|
||||||
return image
|
return image
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user