From 1e86457c90b01b427eb055e5174d307bec825fb7 Mon Sep 17 00:00:00 2001 From: Mick Date: Tue, 25 Mar 2025 11:08:40 +0800 Subject: [PATCH] model: Minicpmo (#3023) --- benchmark/mmmu/bench_hf.py | 31 +- docs/references/supported_models.md | 8 +- python/pyproject.toml | 1 + python/sglang/lang/chat_template.py | 27 +- python/sglang/srt/configs/model_config.py | 13 +- python/sglang/srt/conversation.py | 30 + python/sglang/srt/managers/image_processor.py | 61 - .../srt/managers/image_processors/minicpmv.py | 129 -- python/sglang/srt/managers/io_struct.py | 14 +- python/sglang/srt/managers/mm_utils.py | 144 +- .../srt/managers/multimodal_processor.py | 68 + .../base_processor.py} | 118 +- .../deepseek_vl_v2.py | 36 +- .../gemma3.py | 20 +- .../janus_pro.py | 27 +- .../llava.py | 16 +- .../managers/multimodal_processors/minicpm.py | 167 ++ .../mlama.py | 10 +- .../qwen_vl.py | 22 +- python/sglang/srt/managers/schedule_batch.py | 53 +- python/sglang/srt/managers/scheduler.py | 12 +- .../sglang/srt/managers/session_controller.py | 2 +- .../sglang/srt/managers/tokenizer_manager.py | 19 +- .../srt/model_executor/forward_batch_info.py | 63 +- .../sglang/srt/models/deepseek_janus_pro.py | 9 +- python/sglang/srt/models/deepseek_vl2.py | 8 +- python/sglang/srt/models/gemma3_mm.py | 13 +- python/sglang/srt/models/llama.py | 2 +- python/sglang/srt/models/llava.py | 6 +- python/sglang/srt/models/llavavid.py | 6 +- python/sglang/srt/models/minicpmo.py | 1995 +++++++++++++++++ python/sglang/srt/models/minicpmv.py | 34 +- python/sglang/srt/models/mllama.py | 8 +- python/sglang/srt/models/qwen2_5_vl.py | 9 +- python/sglang/srt/models/qwen2_vl.py | 15 +- python/sglang/srt/openai_api/adapter.py | 10 +- python/sglang/srt/openai_api/protocol.py | 13 +- python/sglang/srt/utils.py | 35 +- test/srt/test_vision_openai_server.py | 118 +- test/srt/test_vlm_accuracy.py | 27 +- 40 files changed, 2906 insertions(+), 493 deletions(-) delete mode 100644 python/sglang/srt/managers/image_processor.py delete mode 100644 python/sglang/srt/managers/image_processors/minicpmv.py create mode 100644 python/sglang/srt/managers/multimodal_processor.py rename python/sglang/srt/managers/{image_processors/base_image_processor.py => multimodal_processors/base_processor.py} (67%) rename python/sglang/srt/managers/{image_processors => multimodal_processors}/deepseek_vl_v2.py (74%) rename python/sglang/srt/managers/{image_processors => multimodal_processors}/gemma3.py (82%) rename python/sglang/srt/managers/{image_processors => multimodal_processors}/janus_pro.py (77%) rename python/sglang/srt/managers/{image_processors => multimodal_processors}/llava.py (92%) create mode 100644 python/sglang/srt/managers/multimodal_processors/minicpm.py rename python/sglang/srt/managers/{image_processors => multimodal_processors}/mlama.py (86%) rename python/sglang/srt/managers/{image_processors => multimodal_processors}/qwen_vl.py (91%) create mode 100644 python/sglang/srt/models/minicpmo.py diff --git a/benchmark/mmmu/bench_hf.py b/benchmark/mmmu/bench_hf.py index 60bc15bc2..2a5078a37 100644 --- a/benchmark/mmmu/bench_hf.py +++ b/benchmark/mmmu/bench_hf.py @@ -1,5 +1,6 @@ import argparse +import PIL.Image import torch from data_utils import save_json from eval_utils import ( @@ -10,22 +11,38 @@ from eval_utils import ( process_result, ) from tqdm import tqdm -from transformers import AutoModelForImageTextToText, AutoProcessor, GenerationConfig +from transformers import AutoModel, AutoProcessor, GenerationConfig @torch.no_grad() def eval_mmmu(args): eval_args = EvalArgs.from_cli_args(args) + try: + from transformers import AutoModelForImageTextToText + + model = AutoModelForImageTextToText.from_pretrained( + args.model_path, + torch_dtype="auto", + trust_remote_code=True, + ) + except Exception as first_exception: + try: + model = AutoModel.from_pretrained( + args.model_path, + torch_dtype="auto", + trust_remote_code=True, + init_tts=False, + ) + except Exception as second_exception: + raise RuntimeError( + f"Failed to load model: First attempt failed with {first_exception}, " + f"second attempt failed with {second_exception}" + ) from second_exception - model = AutoModelForImageTextToText.from_pretrained( - args.model_path, - torch_dtype="auto", - trust_remote_code=True, - ) model = model.eval().cuda() processor = AutoProcessor.from_pretrained( - args.model_path, torch_dtype="auto", device_map="auto" + args.model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True ) samples = prepare_samples(eval_args) diff --git a/docs/references/supported_models.md b/docs/references/supported_models.md index 9eb9c5eff..c77f6af01 100644 --- a/docs/references/supported_models.md +++ b/docs/references/supported_models.md @@ -24,7 +24,7 @@ - InternLM 2 - Exaone 3 - BaiChuan2 -- MiniCPM / MiniCPM 3 / MiniCPMV +- MiniCPM / MiniCPM 3 / MiniCPM-v / MiniCPM-o - XVERSE / XVERSE MoE - SmolLM - GLM-4 @@ -70,9 +70,9 @@ LLM. 1. **Register your new model as multimodal**: Extend `is_multimodal_model` in [ `model_config.py`](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/configs/model_config.py) to return True for your model. -2. **Process Images**: Create a new `ImageProcessor` class that inherits from `BaseImageProcessor` and register this +2. **Process Images**: Define a new `Processor` class that inherits from `BaseProcessor` and register this processor as your model's dedicated processor. See [ - `image_processor.py`](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/image_processor.py) + `multimodal_processor.py`](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/multimodal_processor.py) for more details. 3. **Handle Image Tokens**: Implement a `pad_input_ids` function for your new model, in which image tokens in the prompt should be expanded and replaced with image-hashes, so that SGLang can recognize different images for @@ -80,7 +80,7 @@ LLM. 4. Replace Multi-headed `Attention` of ViT with SGLang's `VisionAttention`. You can refer [Qwen2VL](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/qwen2_vl.py) or other -vLMs. These models demonstrate how to properly handle both visual and textual inputs. +vLMs. These models demonstrate how to properly handle both multimodal and textual inputs. You should test the new vLM locally against hf models. See [`mmmu`](https://github.com/sgl-project/sglang/tree/main/benchmark/mmmu) for an example. diff --git a/python/pyproject.toml b/python/pyproject.toml index e39a4bdb8..736a7dfcb 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -34,6 +34,7 @@ runtime_common = [ "pydantic", "python-multipart", "pyzmq>=25.1.2", + "soundfile==0.13.1", "torchao>=0.7.0", "transformers==4.50.0", "uvicorn", diff --git a/python/sglang/lang/chat_template.py b/python/sglang/lang/chat_template.py index 8677b99b3..8554d28d0 100644 --- a/python/sglang/lang/chat_template.py +++ b/python/sglang/lang/chat_template.py @@ -15,6 +15,7 @@ class ChatTemplate: role_prefix_and_suffix: Dict[str, Tuple[str, str]] stop_str: List[str] = () image_token: str = "" + audio_token: str = "