[Feat/WIP] add llava-onevision, with support for (1) siglip encoder, (2) qwen2 decoder, (3) openai api compatible server. (#1123)
Co-authored-by: Bo Li <drluodian@gmail.com>
parent 5fafcac008, commit a5b14ad043
@@ -231,8 +231,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
 - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
 - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
+- `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --host=127.0.0.1 --tp-size=1 --chat-template=llava_llama_3`
+- `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --host="127.0.0.1" --tp-size=8 --chat-template=chatml-llava`
 - LLaVA-NeXT-Video
   - see [examples/usage/llava_video](examples/usage/llava_video)
+- [LLaVA-OneVision](https://arxiv.org/abs/2408.03326)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --host=127.0.0.1 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+  - see [test/srt/test_llava_onevision_openai_server.py](test/srt/test_llava_onevision_openai_server.py)
 - Yi-VL
   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
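For orientation (editor's illustration, not part of the commit): once a OneVision server like the one above is running, the OpenAI-compatible endpoint accepts standard multimodal chat requests. A minimal sketch, assuming a server on 127.0.0.1:30000 and using the sglang logo as a test image:

    import openai

    client = openai.Client(api_key="EMPTY", base_url="http://127.0.0.1:30000/v1")
    response = client.chat.completions.create(
        model="default",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
                        },
                    },
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ],
        max_tokens=256,
    )
    print(response.choices[0].message.content)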
examples/usage/llava/http_llava_onevision_test.py (new file, 211 lines)
@@ -0,0 +1,211 @@
import base64
import io
import os
import sys
import time

import numpy as np
import openai
import requests
from decord import VideoReader, cpu
from PIL import Image

# pip install httpx==0.23.3
# pip install decord
# pip install protobuf==3.20.0


def download_video(url, cache_dir):
    file_path = os.path.join(cache_dir, "jobs.mp4")
    os.makedirs(cache_dir, exist_ok=True)

    response = requests.get(url)
    response.raise_for_status()

    with open(file_path, "wb") as f:
        f.write(response.content)

    print(f"File downloaded and saved to: {file_path}")
    return file_path


def create_openai_client(base_url):
    return openai.Client(api_key="EMPTY", base_url=base_url)


def image_stream_request_test(client):
    print("----------------------Image Stream Request Test----------------------")
    stream_request = client.chat.completions.create(
        model="default",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
                        },
                    },
                    {
                        "type": "text",
                        "text": "Please describe this image. Please list the benchmarks and the models.",
                    },
                ],
            },
        ],
        temperature=0.7,
        max_tokens=1024,
        stream=True,
    )
    stream_response = ""

    for chunk in stream_request:
        if chunk.choices[0].delta.content is not None:
            content = chunk.choices[0].delta.content
            stream_response += content
            sys.stdout.write(content)
            sys.stdout.flush()

    print("-" * 30)


def video_stream_request_test(client, video_path):
    print("------------------------Video Stream Request Test----------------------")
    messages = prepare_video_messages(video_path)

    video_request = client.chat.completions.create(
        model="default",
        messages=messages,
        temperature=0,
        max_tokens=1024,
        stream=True,
    )
    print("-" * 30)
    video_response = ""

    for chunk in video_request:
        if chunk.choices[0].delta.content is not None:
            content = chunk.choices[0].delta.content
            video_response += content
            sys.stdout.write(content)
            sys.stdout.flush()
    print("-" * 30)


def image_speed_test(client):
    print("----------------------Image Speed Test----------------------")
    start_time = time.time()
    request = client.chat.completions.create(
        model="default",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
                        },
                    },
                    {
                        "type": "text",
                        "text": "Please describe this image. Please list the benchmarks and the models.",
                    },
                ],
            },
        ],
        temperature=0,
        max_tokens=1024,
    )
    end_time = time.time()
    response = request.choices[0].message.content
    print(response)
    print("-" * 30)
    print_speed_test_results(request, start_time, end_time)


def video_speed_test(client, video_path):
    print("------------------------Video Speed Test------------------------")
    messages = prepare_video_messages(video_path)

    start_time = time.time()
    video_request = client.chat.completions.create(
        model="default",
        messages=messages,
        temperature=0,
        max_tokens=1024,
    )
    end_time = time.time()
    video_response = video_request.choices[0].message.content
    print(video_response)
    print("-" * 30)
    print_speed_test_results(video_request, start_time, end_time)


def prepare_video_messages(video_path):
    # Sample up to 32 frames uniformly across the video.
    max_frames_num = 32
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frame_num = len(vr)
    uniform_sampled_frames = np.linspace(
        0, total_frame_num - 1, max_frames_num, dtype=int
    )
    frame_idx = uniform_sampled_frames.tolist()
    frames = vr.get_batch(frame_idx).asnumpy()

    base64_frames = []
    for frame in frames:
        pil_img = Image.fromarray(frame)
        buff = io.BytesIO()
        pil_img.save(buff, format="JPEG")
        base64_str = base64.b64encode(buff.getvalue()).decode("utf-8")
        base64_frames.append(base64_str)

    messages = [{"role": "user", "content": []}]
    # Build a fresh dict per frame; reusing one dict with a shallow .copy()
    # would share the nested "image_url" dict, leaving every frame pointing
    # at the last frame's data URL.
    for base64_frame in base64_frames:
        messages[0]["content"].append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_frame}"},
            }
        )

    prompt = {"type": "text", "text": "Please describe the video in detail."}
    messages[0]["content"].append(prompt)

    return messages


def print_speed_test_results(request, start_time, end_time):
    total_tokens = request.usage.total_tokens
    completion_tokens = request.usage.completion_tokens
    prompt_tokens = request.usage.prompt_tokens

    print(f"Total tokens: {total_tokens}")
    print(f"Completion tokens: {completion_tokens}")
    print(f"Prompt tokens: {prompt_tokens}")
    print(f"Time taken: {end_time - start_time} seconds")
    print(f"Token per second: {total_tokens / (end_time - start_time)}")
    print(f"Completion token per second: {completion_tokens / (end_time - start_time)}")
    print(f"Prompt token per second: {prompt_tokens / (end_time - start_time)}")


def main():
    url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
    cache_dir = os.path.expanduser("~/.cache")
    video_path = download_video(url, cache_dir)

    client = create_openai_client("http://127.0.0.1:30000/v1")

    image_stream_request_test(client)
    video_stream_request_test(client, video_path)
    image_speed_test(client)
    video_speed_test(client, video_path)


if __name__ == "__main__":
    main()
@@ -121,6 +121,20 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=
 if __name__ == "__main__":
+    url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
+
+    cache_dir = os.path.expanduser("~/.cache")
+    file_path = os.path.join(cache_dir, "jobs.mp4")
+
+    os.makedirs(cache_dir, exist_ok=True)
+
+    response = requests.get(url)
+    response.raise_for_status()  # Raise an exception for bad responses
+
+    with open(file_path, "wb") as f:
+        f.write(response.content)
+
+    print(f"File downloaded and saved to: {file_path}")
     # Create the parser
     parser = argparse.ArgumentParser(
         description="Run video processing with specified port."
@@ -148,7 +162,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--video-dir",
         type=str,
-        default="./videos/Q98Z4OTh8RwmDonc.mp4",
+        default=os.path.expanduser("~/.cache/jobs.mp4"),
         help="The directory or path for the processed video files.",
     )
     parser.add_argument(
Binary file not shown.
@@ -20,7 +20,7 @@ dependencies = [
 ]

 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
       "packaging", "pillow", "psutil", "pydantic", "python-multipart",
       "torch", "uvicorn", "uvloop", "zmq",
       "vllm==0.5.4", "outlines>=0.0.44"]
@@ -137,7 +137,7 @@ register_chat_template(
 register_chat_template(
     ChatTemplate(
         name="chatml-llava",
-        default_system_prompt="Answer the questions.",
+        default_system_prompt="You are a helpful assistant.",
         role_prefix_and_suffix={
             "system": ("<|im_start|>system\n", "<|im_end|>\n"),
             "user": ("<|im_start|>user\n", "<|im_end|>\n"),
@@ -145,7 +145,7 @@ register_chat_template(
         },
         style=ChatTemplateStyle.PLAIN,
         stop_str=("<|im_end|>",),
-        image_token=" <image>\n",
+        image_token="<image>\n",
     )
 )
@@ -322,12 +322,17 @@ def match_chat_ml(model_path: str):
     if "tinyllama" in model_path:
         return get_chat_template("chatml")
     # Now the suffix for qwen2 chat model is "instruct"
-    if "qwen" in model_path and ("chat" in model_path or "instruct" in model_path):
+    if (
+        "qwen" in model_path
+        and ("chat" in model_path or "instruct" in model_path)
+        and ("llava" not in model_path)
+    ):
         return get_chat_template("qwen")
     if (
         "llava-v1.6-34b" in model_path
        or "llava-v1.6-yi-34b" in model_path
         or "llava-next-video-34b" in model_path
+        or "llava-onevision-qwen2" in model_path
     ):
         return get_chat_template("chatml-llava")
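Editor's illustration (not part of the commit): the extra "llava" guard keeps llava checkpoints whose names contain "qwen" together with "chat"/"instruct" from being routed to the plain qwen template, while OneVision paths fall through to the new chatml-llava branch. A sketch, assuming sglang's get_chat_template_by_model_path dispatches through these matchers on the lower-cased path:

    from sglang.lang.chat_template import get_chat_template_by_model_path

    # A plain Qwen2 chat model still resolves to the qwen template:
    print(get_chat_template_by_model_path("Qwen/Qwen2-7B-Instruct").name)  # qwen
    # A OneVision checkpoint matches the new "llava-onevision-qwen2" branch:
    print(get_chat_template_by_model_path("lmms-lab/llava-onevision-qwen2-7b-ov").name)  # chatml-llava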
@@ -34,6 +34,7 @@ class SeparatorStyle(IntEnum):
     NO_COLON_TWO = auto()
     ADD_NEW_LINE_SINGLE = auto()
     LLAMA2 = auto()
+    LLAMA3 = auto()
     CHATGLM = auto()
     CHATML = auto()
     CHATINTERN = auto()
@@ -137,6 +138,20 @@ class Conversation:
                 else:
                     ret += role + ":"
             return ret
+        elif self.sep_style == SeparatorStyle.LLAMA3:
+            ret = "<|begin_of_text|>"
+            if self.system_message:
+                ret += system_prompt
+            else:
+                ret += ""
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
+                    ret += f"{message.strip()}<|eot_id|>"
+                else:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
+            # print(ret)
+            return ret
         elif self.sep_style == SeparatorStyle.LLAMA2:
             seps = [self.sep, self.sep2]
             if self.system_message:
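For reference, hand-assembled rather than produced by running the code: a system plus one user turn rendered by the new LLAMA3 branch looks like the string below, with the trailing open assistant header cueing generation:

    prompt = (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a helpful assistant.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        "What is shown in this image?<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )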
@@ -379,12 +394,23 @@ def generate_chat_conv(
                 conv.append_message(conv.roles[0], message.content)
             else:
                 real_content = ""
+                # calculate number of image_url
+                num_image_url = 0
+                for content in message.content:
+                    if content.type == "image_url":
+                        num_image_url += 1
+                if num_image_url > 1:
+                    image_token = "<image>"
+                else:
+                    image_token = "<image>\n"
                 for content in message.content:
                     if content.type == "text":
+                        if num_image_url > 16:
+                            real_content += "\n"  # for video
                         real_content += content.text
                     elif content.type == "image_url":
                         # NOTE: Only works for llava
-                        real_content += "<image>\n"
+                        real_content += image_token
                         conv.append_image(content.image_url.url)
                 conv.append_message(conv.roles[0], real_content)
         elif msg_role == "assistant":
@@ -425,6 +451,18 @@ register_conv_template(
     )
 )

+register_conv_template(
+    Conversation(
+        name="chatml-llava",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are a helpful assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_str=["<|endoftext|>", "<|im_end|>"],
+    )
+)
+
 register_conv_template(
     Conversation(
         name="vicuna_v1.1",
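Likewise hand-assembled for illustration: the chatml-llava conversation renders ChatML headers around each turn, with the image token ("<image>\n", per the template change above) inlined at the start of the user content:

    prompt = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        "<|im_start|>user\n<image>\nWhat is in this image?<|im_end|>\n"
        "<|im_start|>assistant\n"
    )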
@@ -437,6 +475,17 @@ register_conv_template(
     )
 )

+register_conv_template(
+    Conversation(
+        name="llava_llama_3",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA3,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot_id|>"],
+    )
+)
 # Reference: https://github.com/InternLM/lmdeploy/blob/387bf54b4f124e72aab30ae9755f562e435d3d01/lmdeploy/model.py#L425-L442
 register_conv_template(
     Conversation(
@@ -131,11 +131,49 @@ class TokenizerManager:
         self.model_update_lock = asyncio.Lock()
         self.model_update_result = None

-    async def get_pixel_values(self, image_data):
-        aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
-        grid_pinpoints = (
-            self.hf_config.image_grid_pinpoints if aspect_ratio == "anyres" else None
-        )
+    async def get_pixel_values(self, image_data, aspect_ratio=None):
+        aspect_ratio = (
+            getattr(self.hf_config, "image_aspect_ratio", None)
+            if aspect_ratio is None
+            else aspect_ratio
+        )
+        grid_pinpoints = (
+            self.hf_config.image_grid_pinpoints
+            if hasattr(self.hf_config, "image_grid_pinpoints")
+            and "anyres" in aspect_ratio
+            else None
+        )
+
+        if isinstance(image_data, list) and len(image_data) > 0:
+            pixel_values, image_hash, image_size = [], [], []
+            if len(image_data) > 1:
+                aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
+                for img_data in image_data:
+                    pixel_v, image_h, image_s = await self._process_single_image(
+                        img_data, aspect_ratio, grid_pinpoints
+                    )
+                    pixel_values.append(pixel_v)
+                    image_hash.append(image_h)
+                    image_size.append(image_s)
+                pixel_values = np.stack(pixel_values, axis=0)
+            else:
+                pixel_values, image_hash, image_size = await self._process_single_image(
+                    image_data[0], aspect_ratio, grid_pinpoints
+                )
+                image_hash = [image_hash]
+                image_size = [image_size]
+        elif isinstance(image_data, str):
+            pixel_values, image_hash, image_size = await self._process_single_image(
+                image_data, aspect_ratio, grid_pinpoints
+            )
+            image_hash = [image_hash]
+            image_size = [image_size]
+        else:
+            pixel_values, image_hash, image_size = None, None, None
+
+        return pixel_values, image_hash, image_size
+
+    async def _process_single_image(self, image_data, aspect_ratio, grid_pinpoints):
         if self.executor is not None:
             loop = asyncio.get_event_loop()
             return await loop.run_in_executor(
@@ -194,8 +232,8 @@ class TokenizerManager:
             )

         if self.is_generation:
-            pixel_values, image_hash, image_size = await self._get_pixel_values(
-                obj.image_data if not_use_index else obj.image_data[index]
+            pixel_values, image_hash, image_size = await self.get_pixel_values(
+                obj.image_data
             )
             return_logprob = (
                 obj.return_logprob if not_use_index else obj.return_logprob[index]
@@ -704,7 +742,7 @@ def get_pixel_values(
             tuple(int(x * 255) for x in processor.image_processor.image_mean),
         )
         pixel_values = processor.image_processor(image)["pixel_values"][0]
-    elif image_aspect_ratio == "anyres":
+    elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
         pixel_values = process_anyres_image(
             image, processor.image_processor, image_grid_pinpoints
         )
@@ -322,11 +322,16 @@ class ModelTpServer:
             if self.model_runner.is_generation:
                 req.pixel_values = recv_req.pixel_values
                 if req.pixel_values is not None:
+                    image_hash = (
+                        hash(tuple(recv_req.image_hash))
+                        if isinstance(recv_req.image_hash, list)
+                        else recv_req.image_hash
+                    )
                     req.pad_value = [
-                        (recv_req.image_hash) % self.model_config.vocab_size,
-                        (recv_req.image_hash >> 16) % self.model_config.vocab_size,
-                        (recv_req.image_hash >> 32) % self.model_config.vocab_size,
-                        (recv_req.image_hash >> 64) % self.model_config.vocab_size,
+                        (image_hash) % self.model_config.vocab_size,
+                        (image_hash >> 16) % self.model_config.vocab_size,
+                        (image_hash >> 32) % self.model_config.vocab_size,
+                        (image_hash >> 64) % self.model_config.vocab_size,
                     ]
                     req.image_size = recv_req.image_size
                     (
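A standalone sketch of the hash handling above (all numbers invented): with several images per request, image_hash now arrives as a list, is collapsed to one integer, and is sliced into four vocab-sized pad token ids:

    vocab_size = 152064  # Qwen2-72B-scale vocab, for illustration only
    image_hashes = [1234567890123, 9876543210987]  # one hash per image
    image_hash = hash(tuple(image_hashes))  # list -> single int, as in the diff
    pad_value = [
        image_hash % vocab_size,
        (image_hash >> 16) % vocab_size,
        (image_hash >> 32) % vocab_size,
        (image_hash >> 64) % vocab_size,
    ]
    print(pad_value)  # four deterministic ids used to pad the image span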
@@ -13,10 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

-# Source: https://github.com/haotian-liu/LLaVA/blob/main/llava/mm_utils.py
+# Source: https://github.com/LLaVA-VL/LLaVA-NeXT/blob/main/llava/mm_utils.py
+"""
+Utilities for multi-modal models.
+
+This file mainly contains utilities used in the image processing
+logic of LLaVA-NeXT, including operations such as anyres and
+anyres_max.
+
+Currently supports the anyres and anyres_max operations for CLIP and
+SigLIP. For more information, refer to the paper or the blog:
+
+LLaVA-NeXT: https://llava-vl.github.io/blog/2024-01-30-llava-next/
+LLaVA-OneVision: https://arxiv.org/pdf/2408.03326
+"""
 import ast
 import base64
 import math
+import re
 from io import BytesIO

 import numpy as np
@@ -40,10 +55,13 @@ def select_best_resolution(original_size, possible_resolutions):
     min_wasted_resolution = float("inf")

     for width, height in possible_resolutions:
+        # Calculate the downscaled size to keep the aspect ratio
         scale = min(width / original_width, height / original_height)
         downscaled_width, downscaled_height = int(original_width * scale), int(
             original_height * scale
         )
+
+        # Calculate effective and wasted resolutions
         effective_resolution = min(
             downscaled_width * downscaled_height, original_width * original_height
         )
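A standalone re-implementation of the selection rule above, as a worked example: maximize the effective (downscaled) area, breaking ties by minimal wasted area. For a 1000x800 image, (1536, 768) wins among the candidates below:

    def pick(original_size, candidates):
        ow, oh = original_size
        best, best_fit, min_waste = None, 0, float("inf")
        for w, h in candidates:
            scale = min(w / ow, h / oh)
            dw, dh = int(ow * scale), int(oh * scale)
            effective = min(dw * dh, ow * oh)
            wasted = w * h - effective
            if effective > best_fit or (effective == best_fit and wasted < min_waste):
                best, best_fit, min_waste = (w, h), effective, wasted
        return best

    print(pick((1000, 800), [(768, 768), (1536, 768), (768, 1536)]))  # (1536, 768)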
@@ -129,6 +147,26 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
     Returns:
         tuple: The shape of the image patch grid in the format (width, height).
     """
+    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
+        assert patch_size in [
+            224,
+            336,
+            384,
+            448,
+            512,
+        ], "patch_size should be in [224, 336, 384, 448, 512]"
+        # Use regex to extract the range from the input string
+        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
+        range_start = tuple(map(int, matches[0]))
+        range_end = tuple(map(int, matches[-1]))
+        # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
+        grid_pinpoints = [
+            (i, j)
+            for i in range(range_start[0], range_end[0] + 1)
+            for j in range(range_start[1], range_end[1] + 1)
+        ]
+        # Multiply all elements by patch_size
+        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
     if type(grid_pinpoints) is list:
         possible_resolutions = grid_pinpoints
     else:
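The string form is how OneVision configs express pinpoints compactly. A standalone rerun of the expansion above: with a 384-pixel SigLIP input size, "(1x1),...,(6x6)" becomes every tile grid from 384x384 up to 2304x2304 in 384-pixel steps:

    import re

    grid_pinpoints = "(1x1),...,(6x6)"
    patch_size = 384
    matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
    range_start = tuple(map(int, matches[0]))  # (1, 1)
    range_end = tuple(map(int, matches[-1]))   # (6, 6)
    pinpoints = [
        (i * patch_size, j * patch_size)
        for i in range(range_start[0], range_end[0] + 1)
        for j in range(range_start[1], range_end[1] + 1)
    ]
    print(len(pinpoints), pinpoints[0], pinpoints[-1])  # 36 (384, 384) (2304, 2304)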
@@ -149,6 +187,31 @@ def process_anyres_image(image, processor, grid_pinpoints):
     Returns:
         np.array: An np array containing the processed image patches.
     """
+    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
+        try:
+            patch_size = processor.size[0]
+        except Exception:
+            patch_size = processor.size["shortest_edge"]
+        assert patch_size in [
+            224,
+            336,
+            384,
+            448,
+            512,
+        ], "patch_size should be in [224, 336, 384, 448, 512]"
+        # Use regex to extract the range from the input string
+        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
+        range_start = tuple(map(int, matches[0]))
+        range_end = tuple(map(int, matches[-1]))
+        # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
+        grid_pinpoints = [
+            (i, j)
+            for i in range(range_start[0], range_end[0] + 1)
+            for j in range(range_start[1], range_end[1] + 1)
+        ]
+        # Multiply all elements by patch_size
+        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
+
     if type(grid_pinpoints) is list:
         possible_resolutions = grid_pinpoints
     else:
@@ -156,15 +219,24 @@ def process_anyres_image(image, processor, grid_pinpoints):
     best_resolution = select_best_resolution(image.size, possible_resolutions)
     image_padded = resize_and_pad_image(image, best_resolution)

-    patches = divide_to_patches(image_padded, processor.crop_size["height"])
-
-    image_original_resize = image.resize(
-        (processor.size["shortest_edge"], processor.size["shortest_edge"])
-    )
+    # For Siglip processor, only have size but no crop size
+    crop_size = (
+        processor.crop_size["height"]
+        if "crop_size" in processor.__dict__
+        else processor.size["height"]
+    )
+    shortest_edge = (
+        processor.size["shortest_edge"]
+        if "shortest_edge" in processor.size
+        else processor.size["height"]
+    )
+    patches = divide_to_patches(image_padded, crop_size)
+
+    image_original_resize = image.resize((shortest_edge, shortest_edge))

     image_patches = [image_original_resize] + patches
     image_patches = [
-        processor.preprocess(image_patch)["pixel_values"][0]
+        processor.preprocess(image_patch.convert("RGB"))["pixel_values"][0]
         for image_patch in image_patches
     ]
     return np.stack(image_patches, axis=0)
@@ -255,7 +327,7 @@ def process_images(images, image_processor, model_cfg):
         )
         image = image_processor.preprocess(image)["pixel_values"][0]
         new_images.append(image)
-    elif image_aspect_ratio == "anyres":
+    elif "anyres" in image_aspect_ratio:
         for image in images:
             image = process_anyres_image(
                 image, image_processor, model_cfg.image_grid_pinpoints
@@ -88,14 +88,19 @@ class InputMetadata:
         reqs = batch.reqs
         self.pixel_values = [r.pixel_values for r in reqs]
         self.image_sizes = [r.image_size for r in reqs]
-        self.image_offsets = [
-            (
-                (r.image_offset - batch.prefix_lens_cpu[i])
-                if r.image_offset is not None
-                else 0
-            )
-            for i, r in enumerate(reqs)
-        ]
+        self.image_offsets = []
+        for r in reqs:
+            if isinstance(r.image_offset, list):
+                self.image_offsets.append(
+                    [
+                        (image_offset - len(r.prefix_indices))
+                        for image_offset in r.image_offset
+                    ]
+                )
+            elif isinstance(r.image_offset, int):
+                self.image_offsets.append(r.image_offset - len(r.prefix_indices))
+            elif r.image_offset is None:
+                self.image_offsets.append(0)

     def compute_positions(self, batch: ScheduleBatch):
         position_ids_offsets = batch.position_ids_offsets
@@ -15,6 +15,8 @@ limitations under the License.

 """Inference-only LLaVa model compatible with HuggingFace weights."""

+import math
+import re
 from typing import Iterable, List, Optional, Tuple

 import numpy as np
@@ -26,6 +28,8 @@ from transformers import (
     LlavaConfig,
     MistralConfig,
     Qwen2Config,
+    SiglipVisionConfig,
+    SiglipVisionModel,
 )
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
 from vllm.config import CacheConfig
@@ -63,34 +67,61 @@ class LlavaLlamaForCausalLM(nn.Module):
         )

     def pad_input_ids(self, input_ids, pad_value, pt_shape=None, image_size=None):
-        new_image_feature_len = self.image_feature_len
-        # now only support spatial_unpad + anyres
-        if self.mm_patch_merge_type.startswith("spatial"):
-            height = width = self.num_patches_per_side
-            if pt_shape[0] > 1:
-                if self.image_aspect_ratio == "anyres":
-                    num_patch_width, num_patch_height = get_anyres_image_grid_shape(
-                        image_size,
-                        self.image_grid_pinpoints,
-                        self.vision_tower.config.image_size,
-                    )
-                if "unpad" in self.mm_patch_merge_type:
-                    h = num_patch_height * height
-                    w = num_patch_width * width
-                    new_h, new_w = unpad_image_shape(h, w, image_size)
-                    new_image_feature_len += new_h * (new_w + 1)
-
-        pad_ids = pad_value * (
-            (new_image_feature_len + len(pad_value)) // len(pad_value)
-        )
-        offset = input_ids.index(self.config.image_token_index)
-        # old_len + pad_len - 1, because we need to remove image_token_id
-        new_input_ids = (
-            input_ids[:offset]
-            + pad_ids[:new_image_feature_len]
-            + input_ids[offset + 1 :]
-        )
-        return new_input_ids, offset
+        # hardcode for spatial_unpad + anyres
+        image_aspect_ratio = "anyres" if len(image_size) == 1 else "pad"
+        offset_list = []
+        for image_s in image_size:
+            if len(image_size) > 16:
+                # 2x2 pooling with stride 2
+                new_image_feature_len = (
+                    math.ceil(self.image_size / self.patch_size / 2) ** 2
+                )
+            else:
+                new_image_feature_len = self.image_feature_len  # multiimage
+
+            height = width = self.num_patches_per_side
+            if "anyres" in image_aspect_ratio:
+                num_patch_width, num_patch_height = get_anyres_image_grid_shape(
+                    image_s,
+                    self.image_grid_pinpoints,
+                    self.vision_tower.config.image_size,
+                )
+                h = num_patch_height * height
+                w = num_patch_width * width
+                new_h, new_w = unpad_image_shape(h, w, image_s)
+
+                if "anyres_max" in self.config.image_aspect_ratio:
+                    matched_anyres_max_num_patches = re.match(
+                        r"anyres_max_(\d+)", self.config.image_aspect_ratio
+                    )
+                    if matched_anyres_max_num_patches:
+                        max_num_patches = int(matched_anyres_max_num_patches.group(1))
+                        # times = math.sqrt(h * w / (max_num_patches * unit**2))
+                        times = math.sqrt(
+                            new_h * new_w / (max_num_patches * self.image_feature_len)
+                        )
+                        if times > 1.1:
+                            new_h = int(new_h // times)
+                            new_w = int(new_w // times)
+                new_image_feature_len += new_h * (new_w + 1)
+
+            pad_ids = pad_value * (
+                (new_image_feature_len + len(pad_value)) // len(pad_value)
+            )
+            # print("calculated new_image_feature_len: ", new_image_feature_len)
+            try:
+                offset = input_ids.index(self.config.image_token_index)
+            except ValueError:
+                offset = 0
+            # old_len + pad_len - 1, because we need to remove image_token_id
+            input_ids = (
+                input_ids[:offset]
+                + pad_ids[:new_image_feature_len]
+                + input_ids[offset + 1 :]
+            )
+            offset_list.append(offset)
+        return input_ids, offset_list

     def encode_images(self, pixel_values: torch.Tensor) -> torch.Tensor:
         image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
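A numeric check of the anyres_max cap above (shapes chosen for illustration): SigLIP-384 with patch size 14 yields int(384 // 14) ** 2 = 729 base features per tile, which is also why a later hunk switches the feature-length computation from / to //; "anyres_max_9" budgets nine tiles' worth of features:

    import math

    base_features = 729            # self.image_feature_len for SigLIP-384
    max_num_patches = 9            # parsed from "anyres_max_9"
    new_h, new_w = 27 * 4, 27 * 3  # a 4x3 tile grid after unpadding
    times = math.sqrt(new_h * new_w / (max_num_patches * base_features))
    if times > 1.1:                # only shrink when clearly over budget
        new_h, new_w = int(new_h // times), int(new_w // times)
    print(new_h, new_w)  # 93 70: the unpadded grid is scaled down to fit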
@@ -124,7 +155,6 @@ class LlavaLlamaForCausalLM(nn.Module):

         # Embed text input
         input_embeds = self.language_model.model.embed_tokens(input_ids)
-
         # Embed vision input
         need_vision = (
             (positions[input_metadata.extend_start_loc] < self.image_feature_len)
@@ -163,27 +193,73 @@ class LlavaLlamaForCausalLM(nn.Module):

         if self.mm_patch_merge_type.startswith("spatial"):
             new_image_features = []
+            height = width = self.num_patches_per_side
             for image_idx, image_feature in enumerate(image_features):
-                if image_feature.shape[0] > 1:
+                if len(image_sizes[image_idx]) == 1:
+                    image_aspect_ratio = (
+                        self.config.image_aspect_ratio
+                    )  # single image
+                else:
+                    image_aspect_ratio = "pad"  # multi image
+                # image_aspect_ratio = (
+                #     "anyres" if len(image_sizes[image_idx]) == 1 else "pad"
+                # )
+                if (
+                    image_feature.shape[0] > 1
+                    and "anyres" in image_aspect_ratio
+                ):
                     base_image_feature = image_feature[0]
                     image_feature = image_feature[1:]
-                    height = width = self.num_patches_per_side
                     assert height * width == base_image_feature.shape[0]
-                    if self.image_aspect_ratio == "anyres":
-                        (
-                            num_patch_width,
-                            num_patch_height,
-                        ) = get_anyres_image_grid_shape(
-                            image_sizes[image_idx],
-                            self.image_grid_pinpoints,
-                            self.vision_tower.config.image_size,
-                        )
+
+                    if "anyres_max" in image_aspect_ratio:
+                        matched_anyres_max_num_patches = re.match(
+                            r"anyres_max_(\d+)", image_aspect_ratio
+                        )
+                        if matched_anyres_max_num_patches:
+                            max_num_patches = int(
+                                matched_anyres_max_num_patches.group(1)
+                            )
+
+                    if (
+                        image_aspect_ratio == "anyres"
+                        or "anyres_max" in image_aspect_ratio
+                    ):
+                        vision_tower_image_size = self.image_size
+                        try:
+                            num_patch_width, num_patch_height = (
+                                get_anyres_image_grid_shape(
+                                    image_sizes[image_idx][0],
+                                    self.config.image_grid_pinpoints,
+                                    vision_tower_image_size,
+                                )
+                            )
+                        except Exception as e:
+                            print(f"Error: {e}")
+                            num_patch_width, num_patch_height = 2, 2
                         image_feature = image_feature.view(
                             num_patch_height, num_patch_width, height, width, -1
                         )
                     else:
-                        raise NotImplementedError()
+                        image_feature = image_feature.view(
+                            2, 2, height, width, -1
+                        )
+
+                    # (
+                    #     num_patch_width,
+                    #     num_patch_height,
+                    # ) = get_anyres_image_grid_shape(
+                    #     image_sizes[image_idx][0],
+                    #     self.image_grid_pinpoints,
+                    #     self.vision_tower.config.image_size,
+                    # )
+
+                    # image_feature = image_feature.view(
+                    #     num_patch_height, num_patch_width, height, width, -1
+                    # )
+
                     if "unpad" in self.mm_patch_merge_type:
+                        unit = image_feature.shape[2]
                         image_feature = image_feature.permute(
                             4, 0, 2, 1, 3
                         ).contiguous()
@@ -191,8 +267,23 @@ class LlavaLlamaForCausalLM(nn.Module):
                             2, 3
                         )
                         image_feature = unpad_image(
-                            image_feature, image_sizes[image_idx]
+                            image_feature, image_sizes[image_idx][0]
                         )
+                        if (
+                            "anyres_max" in image_aspect_ratio
+                            and matched_anyres_max_num_patches
+                        ):
+                            c, h, w = image_feature.shape
+                            times = math.sqrt(
+                                h * w / (max_num_patches * unit**2)
+                            )
+                            if times > 1.1:
+                                image_feature = image_feature[None]
+                                image_feature = nn.functional.interpolate(
+                                    image_feature,
+                                    [int(h // times), int(w // times)],
+                                    mode="bilinear",
+                                )[0]
                         image_feature = torch.cat(
                             (
                                 image_feature,
@@ -213,16 +304,31 @@ class LlavaLlamaForCausalLM(nn.Module):
                     image_feature = torch.cat(
                         (base_image_feature, image_feature), dim=0
                     )
+                    image_feature = image_feature.unsqueeze(0)
                 else:
-                    image_feature = image_feature[0]
-                    if "unpad" in self.mm_patch_merge_type:
-                        image_feature = torch.cat(
-                            (
-                                image_feature,
-                                self.language_model.model.image_newline[None],
-                            ),
-                            dim=0,
-                        )
+                    if image_feature.shape[0] > 16:  # video
+                        # 2x2 pooling
+                        num_of_frames = image_feature.shape[0]
+                        image_feature = image_feature.view(
+                            num_of_frames, height, width, -1
+                        )
+                        image_feature = image_feature.permute(
+                            0, 3, 1, 2
+                        ).contiguous()  # N, C, H, W
+                        height, weight = image_feature.shape[2:]
+                        scaled_shape = [
+                            math.ceil(height / 2),
+                            math.ceil(weight / 2),
+                        ]
+                        image_feature = nn.functional.interpolate(
+                            image_feature, size=scaled_shape, mode="bilinear"
+                        )
+                        image_feature = (
+                            image_feature.flatten(2)
+                            .transpose(1, 2)
+                            .contiguous()
+                        )  # N, H*W, C
+
                 new_image_features.append(image_feature)
             image_features = new_image_features
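A standalone shape check of the 2x2 frame pooling above (sizes illustrative, matching SigLIP-384's 27x27 feature grid): 32 frames of 729 tokens each are bilinearly downsampled to 196 tokens per frame:

    import math

    import torch
    from torch import nn

    frames = torch.randn(32, 27, 27, 1152)            # N, H, W, C
    x = frames.permute(0, 3, 1, 2).contiguous()       # N, C, H, W
    scaled = [math.ceil(27 / 2), math.ceil(27 / 2)]   # [14, 14]
    x = nn.functional.interpolate(x, size=scaled, mode="bilinear")
    x = x.flatten(2).transpose(1, 2).contiguous()     # N, H*W, C
    print(x.shape)  # torch.Size([32, 196, 1152])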
@@ -233,21 +339,22 @@ class LlavaLlamaForCausalLM(nn.Module):
                     continue

                 start_idx = extend_start_loc_cpu[i]
-                pad_len, pad_dim = image_features[pt].shape  # 576, 4096
+                pad_dim = image_features[pt].shape[-1]  # 576, 4096
                 dim = input_embeds.shape[1]
                 assert (
                     pad_dim == dim
                 ), "invalid pad_dim={}, input_embed_dim={}!".format(pad_dim, dim)
                 # Fill in the placeholder for the image
                 try:
-                    input_embeds[
-                        start_idx
-                        + image_offsets[i] : start_idx
-                        + image_offsets[i]
-                        + pad_len
-                    ] = image_features[pt]
+                    for j, image_off in enumerate(image_offsets[i]):
+                        # print("actual image_features length: ", image_features[pt][j].shape[0])
+                        pad_len = image_features[pt][j].shape[0]
+                        input_embeds[
+                            start_idx + image_off : start_idx + image_off + pad_len
+                        ] = image_features[pt][j]
                 except RuntimeError as e:
                     print(f"RuntimeError in llava image encoding: {e}")
+                    print(image_features[pt].shape)
                     print(input_embeds.shape)
                     print(start_idx, image_offsets[i])
                 pt += 1
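A toy version of the placeholder fill above: per-image feature blocks are written into disjoint spans of the prompt embedding at their (prefix-adjusted) offsets, which is what the new list-of-offsets form enables for multi-image and video requests:

    import torch

    input_embeds = torch.zeros(50, 8)   # toy prompt embedding
    image_features = [torch.ones(5, 8), 2 * torch.ones(5, 8)]  # two images
    image_offsets = [3, 20]             # offsets of the two image pads
    start_idx = 0
    for j, image_off in enumerate(image_offsets):
        pad_len = image_features[j].shape[0]
        input_embeds[start_idx + image_off : start_idx + image_off + pad_len] = image_features[j]
    print(input_embeds[3, 0].item(), input_embeds[20, 0].item())  # 1.0 2.0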
@@ -262,9 +369,16 @@ class LlavaLlamaForCausalLM(nn.Module):
             # load clip vision model by cfg['mm_vision_tower']:
             # huggingface_name or path_of_clip_relative_to_llava_model_dir
             vision_path = self.config.mm_vision_tower
-            self.vision_tower = CLIPVisionModel.from_pretrained(
-                vision_path, torch_dtype=torch.float16
-            ).cuda()
+            if "clip" in vision_path:
+                self.vision_tower = CLIPVisionModel.from_pretrained(
+                    vision_path, torch_dtype=torch.float16
+                ).cuda()
+            elif "siglip" in vision_path:
+                self.vision_tower = SiglipVisionModel.from_pretrained(
+                    vision_path, torch_dtype=torch.float16
+                ).cuda()
+                # Siglip needs all feature tokens
+                self.config.mm_vision_select_feature = "full"
             self.vision_tower.eval()

             self.vision_feature_layer = self.config.mm_vision_select_layer
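Why SigLIP needs "full" while CLIP uses "patch" (an illustration with typical shapes, CLIP ViT-L/14-336 vs SigLIP-so400m-384): CLIP prepends a CLS token that the "patch" strategy strips, whereas SigLIP emits patch tokens only, so stripping index 0 would discard a real image feature:

    import torch

    clip_hidden = torch.randn(1, 577, 1024)    # 1 CLS + 24*24 patches
    siglip_hidden = torch.randn(1, 729, 1152)  # 27*27 patches, no CLS
    print(clip_hidden[:, 1:].shape[1])   # 576 kept under "patch"
    print(siglip_hidden.shape[1])        # 729 kept under "full"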
@@ -276,8 +390,11 @@ class LlavaLlamaForCausalLM(nn.Module):
         self.image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
         self.image_grid_pinpoints = getattr(self.config, "image_grid_pinpoints", None)

-        self.image_feature_len = int((self.image_size / self.patch_size) ** 2)
-        if self.vision_feature_select_strategy == "patch":
+        self.image_feature_len = int((self.image_size // self.patch_size) ** 2)
+        if (
+            self.vision_feature_select_strategy == "patch"
+            or self.vision_feature_select_strategy == "full"
+        ):
             pass
         elif self.vision_feature_select_strategy == "cls_patch":
             self.image_feature_len += 1
@@ -1,17 +1,27 @@
+import base64
+import io
 import json
+import os
+import sys
+import time
 import unittest

+import numpy as np
 import openai
+import requests
+from decord import VideoReader, cpu
+from PIL import Image

 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server


+# python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --tokenizer-path lmms-lab/llavanext-qwen-siglip-tokenizer --port=30000 --host=127.0.0.1 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384
 class TestOpenAIVisionServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
+        cls.model = "lmms-lab/llava-onevision-qwen2-0.5b-ov"
         cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
@@ -21,9 +31,11 @@ class TestOpenAIVisionServer(unittest.TestCase):
             api_key=cls.api_key,
             other_args=[
                 "--chat-template",
-                "vicuna_v1.1",
+                "chatml-llava",
                 "--tokenizer-path",
-                "llava-hf/llava-1.5-7b-hf",
+                "lmms-lab/llavanext-qwen-siglip-tokenizer",
+                "--chunked-prefill-size",
+                "16384",
                 "--log-requests",
             ],
         )
@@ -68,6 +80,81 @@ class TestOpenAIVisionServer(unittest.TestCase):
         assert response.usage.completion_tokens > 0
         assert response.usage.total_tokens > 0

+    def prepare_video_messages(self, video_path):
+        max_frames_num = 32
+        vr = VideoReader(video_path, ctx=cpu(0))
+        total_frame_num = len(vr)
+        uniform_sampled_frames = np.linspace(
+            0, total_frame_num - 1, max_frames_num, dtype=int
+        )
+        frame_idx = uniform_sampled_frames.tolist()
+        frames = vr.get_batch(frame_idx).asnumpy()
+
+        base64_frames = []
+        for frame in frames:
+            pil_img = Image.fromarray(frame)
+            buff = io.BytesIO()
+            pil_img.save(buff, format="JPEG")
+            base64_str = base64.b64encode(buff.getvalue()).decode("utf-8")
+            base64_frames.append(base64_str)
+
+        messages = [{"role": "user", "content": []}]
+        # Build a fresh dict per frame; a shared dict mutated in place would
+        # leave every frame pointing at the last frame's data URL.
+        for base64_frame in base64_frames:
+            messages[0]["content"].append(
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{base64_frame}"},
+                }
+            )
+
+        prompt = {"type": "text", "text": "Please describe the video in detail."}
+        messages[0]["content"].append(prompt)
+
+        return messages
+
+    def test_video_chat_completion(self):
+        url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
+        cache_dir = os.path.expanduser("~/.cache")
+        file_path = os.path.join(cache_dir, "jobs.mp4")
+        os.makedirs(cache_dir, exist_ok=True)
+
+        if not os.path.exists(file_path):
+            response = requests.get(url)
+            response.raise_for_status()
+
+            with open(file_path, "wb") as f:
+                f.write(response.content)
+
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        messages = self.prepare_video_messages(file_path)
+
+        start_time = time.time()
+        video_request = client.chat.completions.create(
+            model="default",
+            messages=messages,
+            temperature=0,
+            max_tokens=1024,
+            stream=True,
+        )
+        print("-" * 30)
+        video_response = ""
+
+        for chunk in video_request:
+            if chunk.choices[0].delta.content is not None:
+                content = chunk.choices[0].delta.content
+                video_response += content
+                sys.stdout.write(content)
+                sys.stdout.flush()
+        print("-" * 30)
+
+        # Add assertions to validate the video response
+        self.assertIsNotNone(video_response)
+        self.assertGreater(len(video_response), 0)
+
     def test_regex(self):
         client = openai.Client(api_key=self.api_key, base_url=self.base_url)