Support precomputed multimodal features for Qwen-VL and Gemma3 models. (#6136)

Co-authored-by: Yury Sulsky <ysulsky@tesla.com>
This commit is contained in:
Yury Sulsky
2025-05-16 12:26:15 -07:00
committed by GitHub
parent c23a7072b6
commit f19a9204cd
14 changed files with 592 additions and 125 deletions

View File

@@ -3,15 +3,22 @@
import unittest
from io import BytesIO
from typing import List
from typing import List, Optional
import numpy as np
import requests
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import AutoModel, AutoProcessor, AutoTokenizer
from transformers import (
AutoModel,
AutoProcessor,
AutoTokenizer,
Gemma3ForConditionalGeneration,
Qwen2_5_VLForConditionalGeneration,
)
from sglang import Engine
from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.conversation import generate_chat_conv
from sglang.srt.managers.mm_utils import embed_mm_inputs
@@ -100,7 +107,7 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
np.testing.assert_allclose(hf_np, sg_np)
def get_processor_output(self):
def get_completion_request(self) -> ChatCompletionRequest:
json_str = f"""
{{
"model": "{self.model_path}",
@@ -124,10 +131,12 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
}}
"""
req = ChatCompletionRequest.model_validate_json(json_str)
return ChatCompletionRequest.model_validate_json(json_str)
def get_processor_output(self, req: Optional[ChatCompletionRequest] = None):
if req is None:
req = self.get_completion_request()
conv = generate_chat_conv(req, template_name=self.chat_template)
text = conv.get_prompt()
# Process inputs using processor
@@ -239,5 +248,129 @@ class TestMiniCPMVLogits(VisionLLMLogitsBase):
self.compare_outputs(sglang_output, hf_output)
class TestQwenVLUnderstandsImage(VisionLLMLogitsBase):
    """Check that Qwen2.5-VL served through ``Engine`` recognises the test image.

    Two input paths are exercised: the raw image, and image features that
    were precomputed with the Hugging Face vision tower.
    """

    @classmethod
    def setUpClass(cls):
        """Configure the model/template names and load the HF processor and vision tower."""
        super().setUpClass()
        cls.model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
        cls.chat_template = "qwen2-vl"
        cls.processor = AutoProcessor.from_pretrained(
            cls.model_path, trust_remote_code=True, use_fast=True
        )
        hf_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            cls.model_path, torch_dtype=torch.bfloat16
        )
        # Only the ``visual`` submodule is stored on the class.
        # NOTE(review): ``cls.device`` is presumably set by the base class — confirm.
        cls.visual = hf_model.eval().visual.to(cls.device)

    def setUp(self):
        """Create a new ``Engine`` for every test."""
        engine_kwargs = {
            "model_path": self.model_path,
            "chat_template": self.chat_template,
            "device": self.device.type,
            "mem_fraction_static": 0.8,
        }
        self.engine = Engine(**engine_kwargs)

    def tearDown(self):
        """Shut down the engine created in ``setUp``."""
        self.engine.shutdown()

    async def test_qwen_vl_understands_image(self):
        """Generate from the chat prompt plus the raw image and expect "taxi" in the text."""
        request = self.get_completion_request()
        conversation = generate_chat_conv(request, template_name=self.chat_template)
        prompt_text = conversation.get_prompt()

        result = await self.engine.async_generate(
            prompt=prompt_text,
            image_data=[self.main_image],
            sampling_params={"temperature": 0.0},
        )

        generated = result["text"].lower()
        self.assertIn("taxi", generated)

    async def test_qwen_vl_understands_precomputed_features(self):
        """Generate from token ids plus precomputed image features and expect "taxi"."""
        request = self.get_completion_request()
        hf_inputs = self.get_processor_output(req=request)

        # Run the HF vision tower ourselves; the engine receives its output
        # instead of the image.
        with torch.inference_mode():
            features = self.visual(
                hf_inputs["pixel_values"], hf_inputs["image_grid_thw"]
            )

        token_ids = hf_inputs["input_ids"][0].detach().cpu().tolist()
        image_item = {
            "modality": "IMAGE",
            "image_grid_thws": hf_inputs["image_grid_thw"],
            "precomputed_features": features,
        }
        result = await self.engine.async_generate(
            input_ids=token_ids,
            image_data=[image_item],
            sampling_params={"temperature": 0.0},
        )

        generated = result["text"].lower()
        self.assertIn("taxi", generated)
class TestGemmaUnderstandsImage(VisionLLMLogitsBase):
    """Check that Gemma 3 served through ``Engine`` recognises the test image.

    Two input paths are exercised: the raw image, and image features that
    were precomputed with the Hugging Face vision tower and projector.
    """

    @classmethod
    def setUpClass(cls):
        """Configure the model/template names and load the HF processor, vision tower and projector."""
        super().setUpClass()
        cls.model_path = "google/gemma-3-4b-it"
        cls.chat_template = "gemma-it"
        cls.processor = AutoProcessor.from_pretrained(
            cls.model_path, trust_remote_code=True, use_fast=True
        )
        hf_model = Gemma3ForConditionalGeneration.from_pretrained(
            cls.model_path, torch_dtype=torch.bfloat16
        )
        # Only the two submodules needed to produce image features are stored.
        # NOTE(review): ``cls.device`` is presumably set by the base class — confirm.
        cls.vision_tower = hf_model.vision_tower.eval().to(cls.device)
        cls.mm_projector = hf_model.multi_modal_projector.eval().to(cls.device)

    @classmethod
    def visual(cls, pixel_values):
        """Return the projector output for the vision tower's last hidden state."""
        tower_output = cls.vision_tower(pixel_values=pixel_values)
        return cls.mm_projector(tower_output.last_hidden_state)

    def setUp(self):
        """Create a new ``Engine`` (multimodal enabled) for every test."""
        engine_kwargs = {
            "model_path": self.model_path,
            "chat_template": self.chat_template,
            "device": self.device.type,
            "mem_fraction_static": 0.5,
            "enable_multimodal": True,
        }
        self.engine = Engine(**engine_kwargs)

    def tearDown(self):
        """Shut down the engine created in ``setUp``."""
        self.engine.shutdown()

    async def test_gemma_understands_image(self):
        """Generate from the chat prompt plus the raw image and expect "taxi" in the text."""
        request = self.get_completion_request()
        conversation = generate_chat_conv(request, template_name=self.chat_template)
        prompt_text = conversation.get_prompt()

        result = await self.engine.async_generate(
            prompt=prompt_text,
            image_data=[self.main_image],
            sampling_params={"temperature": 0.0},
        )

        generated = result["text"].lower()
        self.assertIn("taxi", generated)

    async def test_gemma_understands_precomputed_features(self):
        """Generate from token ids plus precomputed image features and expect "taxi"."""
        request = self.get_completion_request()
        hf_inputs = self.get_processor_output(req=request)

        # Compute the image features ourselves; the engine receives them
        # instead of the image.
        with torch.inference_mode():
            features = self.visual(hf_inputs["pixel_values"])

        token_ids = hf_inputs["input_ids"][0].detach().cpu().tolist()
        image_item = {
            "modality": "IMAGE",
            "precomputed_features": features,
        }
        result = await self.engine.async_generate(
            input_ids=token_ids,
            image_data=[image_item],
            sampling_params={"temperature": 0.0},
        )

        generated = result["text"].lower()
        self.assertIn("taxi", generated)
# Run the tests through unittest's command-line entry point when this file is executed as a script.
if __name__ == "__main__":
unittest.main()