# sglang/test/srt/test_vlm_accuracy.py

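"""
Compare the multimodal embedding output of SGLang against the Hugging Face
reference implementation for vision-language models.
"""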

import unittest
from io import BytesIO
from typing import List, Optional

import numpy as np
import requests
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import AutoModel, AutoProcessor, AutoTokenizer

from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.conversation import generate_chat_conv
from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
from sglang.srt.managers.mm_utils import embed_mm_inputs, init_embedding_cache
from sglang.srt.managers.schedule_batch import (
    Modality,
    MultimodalDataItem,
    MultimodalInputs,
)
from sglang.srt.model_executor.model_runner import ModelRunner
from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
from sglang.srt.server_args import ServerArgs


# Test the logits output between HF and SGLang
class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
    """Shared fixtures and helpers for comparing SGLang outputs with HF outputs.

    ``setUpClass`` downloads one example image and initialises ``model_path``,
    ``chat_template`` and ``processor`` to empty placeholders; concrete
    subclasses are expected to overwrite them before using the helpers below.
    """

    @classmethod
    def setUpClass(cls):
        """Select the device, set placeholder attributes and fetch the test image.

        Raises:
            requests.HTTPError: if the image download returns an error status.
        """
        cls.image_url = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Placeholders; subclasses assign the real values in their setUpClass.
        cls.model_path = ""
        cls.chat_template = ""
        cls.processor = ""
        # Bound the download so a stalled connection cannot hang the test run,
        # and fail on HTTP errors instead of handing an error page to PIL.
        response = requests.get(cls.image_url, timeout=60)
        response.raise_for_status()
        cls.main_image = Image.open(BytesIO(response.content))

    def compare_outputs(self, sglang_output: torch.Tensor, hf_output: torch.Tensor):
        """Print a diagnostic comparison of two tensors, then assert closeness.

        Args:
            sglang_output: Tensor produced by SGLang.
            hf_output: Reference tensor produced by the HF model.

        Raises:
            AssertionError: if ``np.testing.assert_allclose`` (default
                tolerances) rejects the two tensors.
        """
        # Convert to float32 for numerical stability if needed
        hf = hf_output.float()
        sg = sglang_output.float()

        # Basic shape and dtype comparison
        print("\n=== Basic Properties ===")
        print(f"Shapes match: {hf.shape == sg.shape}")
        print(f"HF shape: {hf.shape}, SGLang shape: {sg.shape}")
        print(f"HF dtype: {hf.dtype}, SGLang dtype: {sg.dtype}")

        # Move tensors to CPU for numpy operations
        hf_np = hf.cpu().numpy()
        sg_np = sg.cpu().numpy()

        # Element-wise absolute error and MSE are reused by several metrics.
        diffs = torch.abs(hf - sg)
        mse = torch.mean((hf - sg) ** 2)

        # Statistical metrics
        print("\n=== Statistical Metrics ===")
        print(f"Mean absolute difference: {torch.mean(diffs).item():.6f}")
        print(f"Max absolute difference: {torch.max(diffs).item():.6f}")
        print(f"Mean squared error: {mse.item():.6f}")
        print(f"Root mean squared error: {torch.sqrt(mse).item():.6f}")

        # Cosine similarity (torch default: computed along dim=1)
        cos_sim = F.cosine_similarity(hf, sg)
        print(f"Mean cosine similarity: {torch.mean(cos_sim).item():.6f}")
        print(f"Min cosine similarity: {torch.min(cos_sim).item():.6f}")

        # Find largest absolute differences
        print("\n=== Largest Absolute Differences ===")
        flat_diffs = diffs.flatten()

        # Report up to 10 entries; topk raises if k exceeds the element count.
        top_k = min(10, flat_diffs.numel())
        top_values, top_flat_indices = torch.topk(flat_diffs, top_k)

        # Convert flat indices to multidimensional indices
        top_indices = np.unravel_index(top_flat_indices.cpu().numpy(), diffs.shape)

        print(f"\nTop {top_k} largest absolute differences:")
        print(
            "Index".ljust(30)
            + "Difference".ljust(15)
            + "HF Value".ljust(15)
            + "SGLang Value"
        )
        print("-" * 75)

        for i in range(top_k):
            # Plain ints keep the printed index compact (NumPy scalars may
            # render as ``np.int64(...)``) and index the tensors directly.
            idx = tuple(int(dim[i]) for dim in top_indices)
            diff_val = top_values[i].item()
            hf_val = hf[idx].item()
            sg_val = sg[idx].item()

            # Format the index tuple and values
            idx_str = str(idx)
            print(f"{idx_str:<30}{diff_val:<15.6f}{hf_val:<15.6f}{sg_val:.6f}")

        np.testing.assert_allclose(hf_np, sg_np)

    def get_completion_request(self) -> "ChatCompletionRequest":
        """Build a chat completion request with the test image and a text question.

        Returns:
            The request parsed from a JSON document that references
            ``self.model_path`` and ``self.image_url``.
        """
        json_str = f"""
        {{
  "model": "{self.model_path}",
  "messages": [
    {{
      "role": "user",
      "content": [
        {{
          "type": "image_url",
          "image_url": {{
            "url": "{self.image_url}"
          }}
        }},
        {{
          "type": "text",
          "text": "What's in this picture?"
        }}
      ]
    }}
  ]
}}
        """

        return ChatCompletionRequest.model_validate_json(json_str)

    def get_processor_output(self, req: Optional["ChatCompletionRequest"] = None):
        """Render the chat prompt and run ``self.processor`` on it with the test image.

        Args:
            req: Request to render; when ``None`` the default request from
                ``get_completion_request`` is used.

        Returns:
            The processor output moved to ``self.device``.
        """
        if req is None:
            req = self.get_completion_request()
        conv = generate_chat_conv(req, template_name=self.chat_template)
        text = conv.get_prompt()

        # Process inputs using processor
        # FIXME: the formal arguments may differ
        inputs = self.processor(
            text=[text],
            images=[self.main_image],
            return_tensors="pt",
        ).to(self.device)

        return inputs

    def get_sglang_model(self):
        """Create a ``ModelRunner`` for ``self.model_path`` and return its model.

        The runner is configured for a single rank (``tp_size=1``,
        ``pp_size=1``) on ``gpu_id=0`` with CUDA graphs disabled, and is kept
        on ``self.model_runner``.
        """
        self.model_runner = ModelRunner(
            model_config=ModelConfig(self.model_path, model_override_args="{}"),
            mem_fraction_static=0.8,
            gpu_id=0,
            tp_rank=0,
            tp_size=1,
            pp_rank=0,
            pp_size=1,
            nccl_port=12435,
            server_args=ServerArgs(
                model_path=self.model_path,
                disable_cuda_graph=True,
            ),
        )
        return self.model_runner.model


class TestMiniCPMVLogits(VisionLLMLogitsBase):
    """Checks SGLang's MiniCPM-V multimodal embedding against the HF model."""

    @classmethod
    def setUpClass(cls):
        """Load the MiniCPM-V tokenizer, processor and HF reference model."""
        super().setUpClass()
        checkpoint = "openbmb/MiniCPM-V-2_6"
        cls.model_path = checkpoint
        cls.tokenizer = AutoTokenizer.from_pretrained(
            checkpoint, trust_remote_code=True
        )
        cls.processor = AutoProcessor.from_pretrained(
            checkpoint, trust_remote_code=True
        )
        cls.chat_template = "minicpmv"

        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        cls.device = torch.device(device_name)
        reference_model = AutoModel.from_pretrained(
            checkpoint, torch_dtype=torch.bfloat16, trust_remote_code=True
        )
        cls.hf_model = reference_model.eval().to(cls.device)
        init_embedding_cache()

    async def test_vlm_embedding_output(self):
        """
        Compare the embedding returned by the HF model's ``get_vllm_embedding``
        with the one returned by SGLang's ``embed_mm_inputs`` for the same
        processed prompt.
        """
        processed = self.get_processor_output()

        with torch.no_grad():
            # Reference embedding from the HF model
            hf_embeds, _ = self.hf_model.get_vllm_embedding(
                {
                    "input_ids": processed.input_ids,
                    "image_bound": processed.image_bound,
                    "pixel_values": processed.pixel_values,
                    "tgt_sizes": processed.tgt_sizes,
                },
            )
            hf_embeds = hf_embeds.squeeze(0)

            # SGLang side
            sgl_model = self.get_sglang_model()
            token_ids = processed["input_ids"].to(self.device).flatten()

            # Flatten the nested per-image pixel values / target sizes.
            flat_pixels: List[torch.Tensor] = []
            flat_sizes: List[torch.Tensor] = []
            for image_pixels, image_sizes in zip(
                processed["pixel_values"], processed["tgt_sizes"]
            ):
                n_pixels, n_sizes = len(image_pixels), len(image_sizes)
                if n_pixels != n_sizes:
                    raise ValueError(
                        f"Inconsistent N lengths, found: {n_pixels} vs {n_sizes}"
                    )
                flat_pixels.extend(image_pixels)
                flat_sizes.extend(image_sizes)

            tok = self.tokenizer
            im_start, im_end, slice_start, slice_end = (
                tok.im_start_id,
                tok.im_end_id,
                tok.slice_start_id,
                tok.slice_end_id,
            )

            # Offsets of image-delimited and slice-delimited spans, merged and sorted.
            whole_image_spans = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
                input_ids=token_ids, mm_start_id=im_start, mm_end_id=im_end
            )
            slice_spans = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
                input_ids=token_ids, mm_start_id=slice_start, mm_end_id=slice_end
            )
            all_spans = sorted([*whole_image_spans, *slice_spans])

            # The unk token id is used both as pad value and as image placeholder.
            unk_id = self.processor.tokenizer.unk_token_id
            image_item = MultimodalDataItem(
                feature=flat_pixels,
                offsets=all_spans,
                tgt_size=flat_sizes,
                modality=Modality.IMAGE,
                pad_value=unk_id,
            )

            sgl_embeds = embed_mm_inputs(
                mm_inputs_list=[MultimodalInputs(mm_items=[image_item])],
                extend_prefix_lens=[0],
                extend_seq_lens=[token_ids.shape[0]],
                input_ids=token_ids,
                input_embedding=sgl_model.get_input_embeddings(),
                multimodal_model=sgl_model,
                placeholder_tokens={Modality.IMAGE: unk_id},
            )

        self.compare_outputs(sgl_embeds, hf_embeds)
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00			`"""`
			`"""`

			`import unittest`
			`from io import BytesIO`
Support precomputed multimodal features for Qwen-VL and Gemma3 models. (#6136) Co-authored-by: Yury Sulsky <ysulsky@tesla.com> 2025-05-16 12:26:15 -07:00			`from typing import List, Optional`
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00
			`import numpy as np`
			`import requests`
			`import torch`
			`import torch.nn.functional as F`
			`from PIL import Image`
vlm: support video as an input modality (#5888) 2025-07-10 14:48:35 +08:00			`from transformers import AutoModel, AutoProcessor, AutoTokenizer`
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00
			`from sglang.srt.configs.model_config import ModelConfig`
			`from sglang.srt.conversation import generate_chat_conv`
feat(oai refactor): Replace `openai_api` with `entrypoints/openai` (#7351) Co-authored-by: Jin Pan <jpan236@wisc.edu> 2025-06-21 13:21:06 -07:00			`from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest`
[VLM] Support chunk prefill for VLM (#6355) Co-authored-by: yizhang2077 <1109276519@qq.com> 2025-05-22 20:32:41 -07:00			`from sglang.srt.managers.mm_utils import embed_mm_inputs, init_embedding_cache`
refactor: multimodal data (#4754) 2025-04-01 00:57:51 +08:00			`from sglang.srt.managers.schedule_batch import (`
			`Modality,`
			`MultimodalDataItem,`
			`MultimodalInputs,`
			`)`
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00			`from sglang.srt.model_executor.model_runner import ModelRunner`
Move multimodal processors into a separate folder (#7581) 2025-06-27 11:58:24 -07:00			`from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor`
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00			`from sglang.srt.server_args import ServerArgs`


			`# Test the logits output between HF and SGLang`
			`class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):`
			`@classmethod`
			`def setUpClass(cls):`
			`cls.image_url = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"`
			`cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")`
chore: upgrade transformers 4.52.3 (#6575) Co-authored-by: Mick <mickjagger19@icloud.com> 2025-05-25 22:49:58 -07:00			`cls.model_path = ""`
			`cls.chat_template = ""`
			`cls.processor = ""`
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00			`response = requests.get(cls.image_url)`
			`cls.main_image = Image.open(BytesIO(response.content))`

			`def compare_outputs(self, sglang_output: torch.Tensor, hf_output: torch.Tensor):`
			`# Convert to float32 for numerical stability if needed`
			`hf = hf_output.float()`
			`sg = sglang_output.float()`

			`# Basic shape and dtype comparison`
			`print("\n=== Basic Properties ===")`
			`print(f"Shapes match: {hf.shape == sg.shape}")`
			`print(f"HF shape: {hf.shape}, SGLang shape: {sg.shape}")`
			`print(f"HF dtype: {hf.dtype}, SGLang dtype: {sg.dtype}")`

			`# Move tensors to CPU for numpy operations`
			`hf_np = hf.cpu().numpy()`
			`sg_np = sg.cpu().numpy()`

			`# Statistical metrics`
			`print("\n=== Statistical Metrics ===")`
			`print(f"Mean absolute difference: {torch.mean(torch.abs(hf - sg)).item():.6f}")`
			`print(f"Max absolute difference: {torch.max(torch.abs(hf - sg)).item():.6f}")`
			`print(f"Mean squared error: {torch.mean((hf - sg) ** 2).item():.6f}")`
			`print(`
			`f"Root mean squared error: {torch.sqrt(torch.mean((hf - sg) ** 2)).item():.6f}"`
			`)`

			`# Cosine similarity (across feature dimension)`
			`cos_sim = F.cosine_similarity(hf, sg)`
			`print(f"Mean cosine similarity: {torch.mean(cos_sim).item():.6f}")`
			`print(f"Min cosine similarity: {torch.min(cos_sim).item():.6f}")`

			`# Find largest absolute differences`
			`print("\n=== Largest Absolute Differences ===")`
			`diffs = torch.abs(hf - sg)`
			`flat_diffs = diffs.flatten()`

			`# Get indices of top 10 differences`
			`top_k = 10`
			`top_values, top_flat_indices = torch.topk(flat_diffs, top_k)`

			`# Convert flat indices to multidimensional indices`
			`top_indices = np.unravel_index(top_flat_indices.cpu().numpy(), diffs.shape)`

			`print(f"\nTop {top_k} largest absolute differences:")`
			`print(`
			`"Index".ljust(30)`
			`+ "Difference".ljust(15)`
			`+ "HF Value".ljust(15)`
			`+ "SGLang Value"`
			`)`
			`print("-" * 75)`

			`for i in range(top_k):`
			`# Get the index tuple for this difference`
			`idx = tuple(dim[i] for dim in top_indices)`
			`diff_val = top_values[i].item()`
			`hf_val = hf[idx].item()`
			`sg_val = sg[idx].item()`

			`# Format the index tuple and values`
			`idx_str = str(idx)`
			`print(f"{idx_str:<30}{diff_val:<15.6f}{hf_val:<15.6f}{sg_val:.6f}")`

			`np.testing.assert_allclose(hf_np, sg_np)`

Support precomputed multimodal features for Qwen-VL and Gemma3 models. (#6136) Co-authored-by: Yury Sulsky <ysulsky@tesla.com> 2025-05-16 12:26:15 -07:00			`def get_completion_request(self) -> ChatCompletionRequest:`
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00			`json_str = f"""`
			`{{`
			`"model": "{self.model_path}",`
			`"messages": [`
			`{{`
			`"role": "user",`
			`"content": [`
			`{{`
			`"type": "image_url",`
			`"image_url": {{`
			`"url": "{self.image_url}"`
			`}}`
			`}},`
			`{{`
			`"type": "text",`
Add typo checker in pre-commit (#6179) Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca> 2025-05-11 00:55:00 -04:00			`"text": "What's in this picture?"`
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00			`}}`
			`]`
			`}}`
			`]`
			`}}`
			`"""`

Support precomputed multimodal features for Qwen-VL and Gemma3 models. (#6136) Co-authored-by: Yury Sulsky <ysulsky@tesla.com> 2025-05-16 12:26:15 -07:00			`return ChatCompletionRequest.model_validate_json(json_str)`
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00
Support precomputed multimodal features for Qwen-VL and Gemma3 models. (#6136) Co-authored-by: Yury Sulsky <ysulsky@tesla.com> 2025-05-16 12:26:15 -07:00			`def get_processor_output(self, req: Optional[ChatCompletionRequest] = None):`
			`if req is None:`
			`req = self.get_completion_request()`
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00			`conv = generate_chat_conv(req, template_name=self.chat_template)`
			`text = conv.get_prompt()`

			`# Process inputs using processor`
			`# FIXME: the formal arguments may differ`
			`inputs = self.processor(`
			`text=[text],`
			`images=[self.main_image],`
			`return_tensors="pt",`
			`).to(self.device)`

			`return inputs`

			`def get_sglang_model(self):`
model: Minicpmo (#3023) 2025-03-25 11:08:40 +08:00			`self.model_runner = ModelRunner(`
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00			`model_config=ModelConfig(self.model_path, model_override_args="{}"),`
			`mem_fraction_static=0.8,`
			`gpu_id=0,`
			`tp_rank=0,`
			`tp_size=1,`
[PP] Add pipeline parallelism (#5724) 2025-04-30 18:18:07 -07:00			`pp_rank=0,`
			`pp_size=1,`
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00			`nccl_port=12435,`
			`server_args=ServerArgs(`
			`model_path=self.model_path,`
			`disable_cuda_graph=True,`
			`),`
			`)`
model: Minicpmo (#3023) 2025-03-25 11:08:40 +08:00			`return self.model_runner.model`
[Fix] Address remaining issues of supporting MiniCPMV (#2977) 2025-01-28 16:22:13 +08:00

vlm: support video as an input modality (#5888) 2025-07-10 14:48:35 +08:00			`class TestMiniCPMVLogits(VisionLLMLogitsBase):`
			`@classmethod`
			`def setUpClass(cls):`
			`super().setUpClass()`
			`cls.model_path = "openbmb/MiniCPM-V-2_6"`
			`cls.tokenizer = AutoTokenizer.from_pretrained(`
			`cls.model_path, trust_remote_code=True`
			`)`
			`cls.processor = AutoProcessor.from_pretrained(`
			`cls.model_path, trust_remote_code=True`
			`)`
			`cls.chat_template = "minicpmv"`

			`cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")`
			`cls.hf_model = (`
			`AutoModel.from_pretrained(`
			`cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True`
			`)`
			`.eval()`
			`.to(cls.device)`
			`)`
			`init_embedding_cache()`

			`async def test_vlm_embedding_output(self):`
			`"""`
			`Compares the embedding output of vlm`
			`"""`
			`inputs = self.get_processor_output()`

			`with torch.no_grad():`
			`# hf`
			`model_inputs = {`
			`"input_ids": inputs.input_ids,`
			`"image_bound": inputs.image_bound,`
			`"pixel_values": inputs.pixel_values,`
			`"tgt_sizes": inputs.tgt_sizes,`
			`}`
			`(hf_output, _) = self.hf_model.get_vllm_embedding(`
			`model_inputs,`
			`)`
			`hf_output = hf_output.squeeze(0)`

			`# sglang`
			`model = self.get_sglang_model()`
			`input_ids = inputs["input_ids"].to(self.device).flatten()`

			`pixel_values = inputs["pixel_values"]`
			`tgt_sizes = inputs["tgt_sizes"]`
			`pixel_values_flat: List[torch.Tensor] = []`
			`tgt_sizes_flat: List[torch.Tensor] = []`
			`for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):`
			`# per image`
			`if len(pixel_b) != len(tgt_b):`
			`raise ValueError(`
			`"Inconsistent N lengths, found: "`
			`f"{len(pixel_b)} vs {len(tgt_b)}"`
			`)`
			`for pixel_n, tgt_n in zip(pixel_b, tgt_b):`
			`pixel_values_flat += [pixel_n]`
			`tgt_sizes_flat += [tgt_n]`

			`im_start_id, im_end_id = (`
			`self.tokenizer.im_start_id,`
			`self.tokenizer.im_end_id,`
			`)`
			`slice_start_id, slice_end_id = (`
			`self.tokenizer.slice_start_id,`
			`self.tokenizer.slice_end_id,`
			`)`

			`image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(`
			`input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id`
			`)`
			`slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(`
			`input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id`
			`)`
			`image_offsets.extend(slice_offsets)`
			`image_offsets = sorted(image_offsets)`

			`sglang_output = embed_mm_inputs(`
			`mm_inputs_list=[`
			`MultimodalInputs(`
			`mm_items=[`
			`MultimodalDataItem(`
refactor: unify names of the feature field of MultimodalDataItem (#8075) 2025-07-17 08:52:38 +08:00			`feature=pixel_values_flat,`
vlm: support video as an input modality (#5888) 2025-07-10 14:48:35 +08:00			`offsets=image_offsets,`
			`tgt_size=tgt_sizes_flat,`
			`modality=Modality.IMAGE,`
			`pad_value=self.processor.tokenizer.unk_token_id,`
			`)`
			`]`
			`),`
			`],`
			`extend_prefix_lens=[0],`
			`extend_seq_lens=[input_ids.shape[0]],`
			`input_ids=input_ids,`
			`input_embedding=model.get_input_embeddings(),`
			`multimodal_model=model,`
			`placeholder_tokens={`
			`Modality.IMAGE: self.processor.tokenizer.unk_token_id,`
			`},`
			`)`

			`self.compare_outputs(sglang_output, hf_output)`