[model] support MiniCPM-V 4.0 (#8747)
Signed-off-by: tc-mb <caitianchi@modelbest.cn>
Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
@@ -54,6 +54,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.utils import set_default_torch_dtype
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.idefics2 import Idefics2VisionTransformer
+from sglang.srt.models.llama import LlamaConfig, LlamaForCausalLM
 from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
 from sglang.srt.utils import add_prefix, flatten_nested_list
 
@@ -581,7 +582,7 @@ class MiniCPMBaseModel(nn.Module):
 
     def init_llm(
         self,
-        config: Qwen2Config,
+        config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> nn.Module:
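Note: widening the annotation in `MiniCPMBaseModel.init_llm` from `Qwen2Config` to `PretrainedConfig` is what lets each version-specific subclass bind its own language backbone (Qwen2 for 2.6, Llama for 4.0). A minimal sketch of the pattern, with placeholder class names that are not taken from this diff:

```python
from transformers import LlamaConfig, PretrainedConfig, Qwen2Config


class Base:
    # The base accepts any PretrainedConfig; subclasses narrow the type
    # to the config of the language model they actually wrap.
    def init_llm(self, config: PretrainedConfig) -> str:
        raise NotImplementedError


class V2_6(Base):
    def init_llm(self, config: Qwen2Config) -> str:
        return "Qwen2ForCausalLM"  # MiniCPM-V 2.6 wraps Qwen2


class V4_0(Base):
    def init_llm(self, config: LlamaConfig) -> str:
        return "LlamaForCausalLM"  # MiniCPM-V 4.0 wraps Llama


assert V4_0().init_llm(LlamaConfig()) == "LlamaForCausalLM"
```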
@@ -774,7 +775,168 @@ class MiniCPMV2_6(MiniCPMBaseModel):
         return pattern.pad_input_tokens(input_ids, image_inputs)
 
 
-_SUPPORT_VERSION = {(2, 6): MiniCPMV2_6}
+class MiniCPMV4_0(MiniCPMBaseModel):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        # vision encoder
+        "fc1",
+        "fc2",
+        "out_proj",
+        # language model
+        "qkv_proj",  # same name with vision encoder
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        # resampler
+        "kv_proj",
+    ]
+
+    # BitandBytes specific attributes
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
+        assert self.version == (4, 0)
+
+    def init_llm(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        return LlamaForCausalLM(config=config, quant_config=quant_config, prefix=prefix)
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> nn.Module:
+        model = Idefics2VisionTransformer(
+            config=config.vision_config, quant_config=quant_config, prefix=prefix
+        )
+        if self.config.drop_vision_last_layer:
+            model.encoder.layers = model.encoder.layers[:-1]
+
+        setattr(model, "embed_dim", model.embeddings.embed_dim)
+        setattr(model, "patch_size", model.embeddings.patch_size)
+        return model
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        with set_default_torch_dtype(torch.float16):
+            # The resampler in 2.6 remains consistent with the one in 2.5.
+            resampler = Resampler2_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(device="cuda", dtype=torch.get_default_dtype())
+
+    def get_vision_embedding(
+        self,
+        pixel_values: List[torch.Tensor],
+        patch_attn_mask: Optional[torch.Tensor] = None,
+        tgt_sizes: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        vision_embedding = self.vpm(
+            pixel_values,
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return vision_embedding
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # list of tensors
+        pixel_values = flatten_nested_list([item.feature for item in items])
+        tgt_sizes = torch.stack(
+            flatten_nested_list([item.tgt_size for item in items]), dim=0
+        )
+        assert len(pixel_values) == tgt_sizes.shape[0]
+
+        device = self.vpm.embeddings.position_embedding.weight.device
+        dtype = self.vpm.embeddings.position_embedding.weight.dtype
+        all_pixel_values_lst = [
+            i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
+        ]
+
+        max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
+        assert isinstance(max_patches, int)
+        all_pixel_values = torch.nn.utils.rnn.pad_sequence(
+            all_pixel_values_lst, batch_first=True, padding_value=0.0
+        )
+
+        B, L, _ = all_pixel_values.shape
+        all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+        patch_attn_mask = torch.zeros(
+            (B, 1, max_patches), dtype=torch.bool, device=device
+        )
+
+        tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+        mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+        patch_attn_mask[:, 0, :] = torch.arange(
+            patch_attn_mask.size(2), device=patch_attn_mask.device
+        ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
+        vision_embedding = self.vpm(
+            all_pixel_values.type(dtype),
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return self.resampler(vision_embedding, tgt_sizes)
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = image_inputs.im_start_id
+        im_end_id: int = image_inputs.im_end_id
+        slice_start_id: int = image_inputs.slice_start_id
+        slice_end_id: int = image_inputs.slice_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id), (slice_start_id, slice_end_id)]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return pattern.pad_input_tokens(input_ids, image_inputs)
+
+
+_SUPPORT_VERSION = {
+    (2, 6): MiniCPMV2_6,
+    (4, 0): MiniCPMV4_0,
+}
 
 
 class MiniCPMV:
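The bulk of `MiniCPMV4_0.get_image_feature` is the batching scheme: flattened patch sequences of different lengths are padded to the longest one with `pad_sequence`, and a boolean mask marks which slots hold real patches. A standalone sketch of just that scheme, with illustrative shapes (the 588-wide patch vector is made up for the example):

```python
import torch

tgt_sizes = torch.tensor([[4, 6], [3, 5]])  # per-image (height, width) in patches
seqs = [torch.randn(int(h * w), 588) for h, w in tgt_sizes]  # flattened patch features

# Pad every sequence to the longest one in the batch.
padded = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=0.0)
lengths = tgt_sizes[:, 0] * tgt_sizes[:, 1]
max_patches = int(lengths.max())

# True where a slot holds a real patch, False where it is padding.
patch_attn_mask = torch.arange(max_patches).unsqueeze(0) < lengths.unsqueeze(1)

assert padded.shape == (2, max_patches, 588)
assert patch_attn_mask.sum(dim=1).tolist() == lengths.tolist()
```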
@@ -809,7 +971,7 @@ class MiniCPMV:
         # Dispatch class based on version
         instance_class = _SUPPORT_VERSION.get(version)
         if instance_class is None:
-            raise ValueError("Currently, MiniCPMV only supports versions 2.6")
+            raise ValueError("Currently, MiniCPMV only supports versions 2.6 and 4.0")
 
         try:
             minicpmv = instance_class(
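The dispatch itself stays a plain dict lookup keyed by the `(major, minor)` version tuple, failing fast on unknown versions. A minimal runnable sketch, with strings standing in for the model classes:

```python
from typing import Tuple

_SUPPORT_VERSION = {
    (2, 6): "MiniCPMV2_6",
    (4, 0): "MiniCPMV4_0",
}


def resolve(version: Tuple[int, int]) -> str:
    # Mirrors the dispatch in the diff above: unknown versions raise.
    instance_class = _SUPPORT_VERSION.get(version)
    if instance_class is None:
        raise ValueError("Currently, MiniCPMV only supports versions 2.6 and 4.0")
    return instance_class


assert resolve((4, 0)) == "MiniCPMV4_0"
```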
@@ -39,7 +39,7 @@ class TestCompressedTensorsLlama3FP8(CustomTestCase):
         )
         metrics = run_eval(args)
         print(f"{metrics=}")
-        self.assertGreater(metrics["accuracy"], 0.45)
+        self.assertGreaterEqual(metrics["accuracy"], 0.45)
 
 
 if __name__ == "__main__":
@@ -165,6 +165,27 @@ class TestMinicpmvServer(ImageOpenAITestMixin):
         cls.base_url += "/v1"
 
 
+class TestMinicpmv4Server(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "openbmb/MiniCPM-V-4"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.35",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
 class TestInternVL2_5Server(ImageOpenAITestMixin):
     @classmethod
     def setUpClass(cls):
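For manual verification outside the test suite, something like the following should exercise the new model through the OpenAI-compatible endpoint, assuming a server launched with the same flags the test uses; the base URL and image URL are placeholders:

```python
# Launch first, e.g.:
#   python -m sglang.launch_server --model-path openbmb/MiniCPM-V-4 \
#       --trust-remote-code --mem-fraction-static 0.35 --cuda-graph-max-bs 4
import openai

client = openai.Client(api_key="sk-123456", base_url="http://127.0.0.1:30000/v1")
response = client.chat.completions.create(
    model="openbmb/MiniCPM-V-4",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ],
    max_tokens=64,
)
print(response.choices[0].message.content)
```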
@@ -184,7 +205,7 @@ class TestInternVL2_5Server(ImageOpenAITestMixin):
         cls.base_url += "/v1"
 
 
-class TestMinicpmoServer(ImageOpenAITestMixin, AudioOpenAITestMixin):
+class TestMinicpmo2_6Server(ImageOpenAITestMixin, AudioOpenAITestMixin):
     @classmethod
     def setUpClass(cls):
         cls.model = "openbmb/MiniCPM-o-2_6"
@@ -161,7 +161,7 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
         return self.model_runner.model
 
 
-class TestMiniCPMVLogits(VisionLLMLogitsBase):
+class TestMiniCPMV2_6Logits(VisionLLMLogitsBase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
@@ -265,3 +265,60 @@ class TestMiniCPMVLogits(VisionLLMLogitsBase):
         )
 
         self.compare_outputs(sglang_output, hf_output)
+
+
+class TestMiniCPMV4Logits(VisionLLMLogitsBase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.model_path = "openbmb/MiniCPM-V-4"
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.processor = AutoProcessor.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.chat_template = "minicpmv"
+
+        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        cls.hf_model = (
+            AutoModel.from_pretrained(
+                cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
+            )
+            .eval()
+            .to(cls.device)
+        )
+        init_embedding_cache()
+
+    async def test_vlm_embedding_output(self):
+        """
+        Compares the embedding output of vlm
+        """
+        inputs = self.get_processor_output()
+
+        with torch.no_grad():
+            # hf
+            model_inputs = {
+                "input_ids": inputs.input_ids,
+                "image_bound": inputs.image_bound,
+                "pixel_values": inputs.pixel_values,
+                "tgt_sizes": inputs.tgt_sizes,
+            }
+            hf_output = self.hf_model.get_input_embeddings()(inputs.input_ids)
+
+            # sglang
+            model = self.get_model()
+            sglang_output = self.vlm_func(
+                model,
+                input_ids=inputs.input_ids.to(self.device),
+                pixel_values=inputs.pixel_values,
+                image_bound=inputs.image_bound.to(self.device),
+                tgt_sizes=inputs.tgt_sizes.to(self.device),
+                input_embedding=model.get_input_embeddings(),
+                multimodal_model=model,
+                placeholder_tokens={
+                    Modality.IMAGE: self.processor.tokenizer.unk_token_id,
+                },
+            )
+
+        self.compare_outputs(sglang_output, hf_output)
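`compare_outputs` is the existing helper from `VisionLLMLogitsBase` and its tolerances are not shown in this diff; as a rough idea of the kind of check such a helper performs, a hedged sketch:

```python
import torch


def compare_embeddings(a: torch.Tensor, b: torch.Tensor, tol: float = 1e-2) -> None:
    # Hypothetical comparison, not the real compare_outputs implementation:
    # move both tensors to CPU float32 and bound the elementwise divergence.
    a, b = a.float().cpu(), b.float().cpu()
    max_diff = (a - b).abs().max().item()
    assert max_diff < tol, f"embeddings diverge: max abs diff {max_diff:.4g}"
```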